1 #formats and combines phenotype (of a single trait)
2 #and genotype datasets of multiple
11 library(phenoAnalysis
)
14 library(genoDataFilter
)
16 allArgs
<- commandArgs()
18 inFile
<- grep("input_files",
25 inFiles
<- scan(inFile
,
29 outFile
<- grep("output_files",
36 outFiles
<- scan(outFile
,
40 combinedGenoFile
<- grep("genotype_data",
47 combinedPhenoFile
<- grep("model_phenodata",
54 analysisReportFile
<- grep("_report_",
61 ## traitFile <- grep("model_phenodata",
63 ## ignore.case = TRUE,
68 ## trait <- scan(traitFile,
69 ## what = "character",
72 ## traitInfo <- strsplit(trait, "\t");
73 ## traitId <- traitInfo[[1]]
74 ## traitName <- traitInfo[[2]]
76 modelInfoFile
<- grep("model_info", inFiles
, value
= TRUE)
77 message('model_info_file ', modelInfoFile
)
79 traitRawPhenoFile
<- grep('trait_raw_phenodata', outFiles
, value
= TRUE)
81 modelInfo
<- read
.table(modelInfoFile
,
82 header
=TRUE, sep
="\t",
85 modelInfo
<- column_to_rownames(modelInfo
, var
="Name")
86 traitId
<- modelInfo
["trait_id", 1]
87 traitAbbr
<- modelInfo
["trait_abbr", 1]
88 modelId
<- modelInfo
["model_id", 1]
89 protocolId
<- modelInfo
["protocol_id", 1]
91 message('class ', class(traitAbbr
))
92 message('trait_id ', traitId
)
93 message('trait_abbr ', traitAbbr
)
94 message('protocol_id ', protocolId
)
95 message('model_id ', modelId
)
97 #extract trait phenotype data from all populations
98 #and combine them into one dataset
100 allPhenoFiles
<- grep("phenotype_data",
106 message("phenotype files: ", allPhenoFiles
)
108 allGenoFiles
<- grep("genotype_data",
116 combinedPhenoPops
<- c()
118 traitRawPhenoData
<- c()
120 logHeading
<- paste0("Genomic Prediction Analysis Log for ", traitAbbr
, ".\n")
121 logHeading
<- append(logHeading
, paste0("Date: ", format(Sys
.time(), "%d %b %Y %H:%M"), "\n\n\n"))
122 logHeading
<- format(logHeading
, width
=80, justify
="c")
124 anovaLog
<- paste0("#Preprocessing training population phenotype data.\n\n")
125 anovaLog
<- append(anovaLog
, paste0("This training population is composed of ", length(allPhenoFiles
), " trials. Adjusted means or arithmetic trait values for each trial will be calculated as described below. After that, single trait values for the entries will be calculated by averaging trait adjusted means/arithmetic averages across the trials.\n\n"))
127 for (popPhenoFile
in allPhenoFiles
) {
131 phenoData
<- fread(popPhenoFile
,
134 na
.strings
= c("NA", "", "--", "-", "."))
136 phenoData
<- data
.frame(phenoData
)
138 meansResult
<- getAdjMeans(phenoData
,
139 traitName
= traitAbbr
,
143 anovaLog
<- append(anovaLog
, meansResult$log
)
144 phenoTrait
<- meansResult$adjMeans
146 keepMetaCols
<- c('observationUnitName', 'germplasmName', 'studyDbId', 'locationName',
147 'studyYear', 'replicate', 'blockNumber')
149 trialTraitRawPhenoData
<- phenoData
%>%
150 select(c(keepMetaCols
, traitAbbr
))
152 popIdFile
<- basename(popPhenoFile
)
153 popId
<- str_extract(popIdFile
, "\\d+")
154 popIds
<- c(popIds
, popId
)
156 newTraitName
<- paste(traitAbbr
, popId
, sep
= "_")
157 colnames(phenoTrait
)[2] <- newTraitName
160 combinedPhenoPops
<- phenoTrait
161 traitRawPhenoData
<- trialTraitRawPhenoData
163 combinedPhenoPops
<- full_join(combinedPhenoPops
, phenoTrait
, by
='germplasmName')
164 traitRawPhenoData
<- bind_rows(traitRawPhenoData
, trialTraitRawPhenoData
)
168 combinedPhenoPops
<- column_to_rownames(combinedPhenoPops
, var
='germplasmName')
170 # #fill in missing data in combined phenotype dataset
172 naIndices
<- which(is
.na(combinedPhenoPops
), arr
.ind
=TRUE)
173 combinedPhenoPops
<- as
.matrix(combinedPhenoPops
)
174 combinedPhenoPops
[naIndices
] <- rowMeans(combinedPhenoPops
, na
.rm
=TRUE)[naIndices
[,1]]
175 combinedPhenoPops
<- as
.data
.frame(combinedPhenoPops
)
177 message("combined total number of stocks in phenotype dataset (before averaging): ", length(rownames(combinedPhenoPops
)))
179 combinedPhenoPops$Average
<-round(apply(combinedPhenoPops
, 1, function(x
) { mean(x
) }), digits
= 2)
181 combinedGenoPops
<- c()
183 if (file
.size(combinedGenoFile
) < 100 ) {
184 combinedGenoPops
<- combineGenoData(allGenoFiles
)
185 combinedGenoPops$trial
<- NULL
186 combinedGenoPops
<- combinedGenoPops
[order(rownames(combinedGenoPops
)), ]
189 message("writing data to files...")
190 #if(length(combinedPhenoFile) != 0 )
192 fwrite(combinedPhenoPops
,
193 file
= combinedPhenoFile
,
199 if (!is
.null(traitRawPhenoData
) & length(traitRawPhenoFile
) != 0) {
201 fwrite(traitRawPhenoData
,
202 file
= traitRawPhenoFile
,
209 if(!is
.null(combinedGenoPops
)) {
210 fwrite(combinedGenoPops
,
211 file
= combinedGenoFile
,
218 cat(anovaLog
, fill
= TRUE, file
= analysisReportFile
, append
=FALSE)
220 q(save
= "no", runLast
= FALSE)