Merge pull request #5191 from solgenomics/topic/quality_control
[sgn.git] / R / solGS / combine_populations.r
blob29f2c16304db7b06301829ba6ab5853b6ced2a94
1 #formats and combines phenotype (of a single trait)
2 #and genotype datasets of multiple
3 #populations
5 options(echo = FALSE)
7 library(stringr)
8 library(randomForest)
9 library(lme4)
10 library(data.table)
11 library(phenoAnalysis)
12 library(dplyr)
13 library(tibble)
14 library(genoDataFilter)
16 allArgs <- commandArgs()
18 inFile <- grep("input_files",
19 allArgs,
20 ignore.case = TRUE,
21 perl = TRUE,
22 value = TRUE
25 inFiles <- scan(inFile,
26 what = "character"
29 outFile <- grep("output_files",
30 allArgs,
31 ignore.case = TRUE,
32 perl = TRUE,
33 value = TRUE
36 outFiles <- scan(outFile,
37 what = "character"
40 combinedGenoFile <- grep("genotype_data",
41 outFiles,
42 ignore.case = TRUE,
43 fixed = FALSE,
44 value = TRUE
47 combinedPhenoFile <- grep("model_phenodata",
48 outFiles,
49 ignore.case = TRUE,
50 fixed = FALSE,
51 value = TRUE
54 analysisReportFile <- grep("_report_",
55 outFiles,
56 ignore.case = TRUE,
57 fixed = FALSE,
58 value = TRUE
61 ## traitFile <- grep("model_phenodata",
62 ## inFiles,
63 ## ignore.case = TRUE,
64 ## fixed = FALSE,
65 ## value = TRUE
66 ## )
68 ## trait <- scan(traitFile,
69 ## what = "character",
70 ## )
72 ## traitInfo <- strsplit(trait, "\t");
73 ## traitId <- traitInfo[[1]]
74 ## traitName <- traitInfo[[2]]
76 modelInfoFile <- grep("model_info", inFiles, value = TRUE)
77 message('model_info_file ', modelInfoFile)
79 traitRawPhenoFile <- grep('trait_raw_phenodata', outFiles, value = TRUE)
81 modelInfo <- read.table(modelInfoFile,
82 header=TRUE, sep ="\t",
83 as.is = c('Value'))
85 modelInfo <- column_to_rownames(modelInfo, var="Name")
86 traitId <- modelInfo["trait_id", 1]
87 traitAbbr <- modelInfo["trait_abbr", 1]
88 modelId <- modelInfo["model_id", 1]
89 protocolId <- modelInfo["protocol_id", 1]
91 message('class ', class(traitAbbr))
92 message('trait_id ', traitId)
93 message('trait_abbr ', traitAbbr)
94 message('protocol_id ', protocolId)
95 message('model_id ', modelId)
97 #extract trait phenotype data from all populations
98 #and combine them into one dataset
100 allPhenoFiles <- grep("phenotype_data",
101 inFiles,
102 ignore.case = TRUE,
103 fixed = FALSE,
104 value = TRUE
106 message("phenotype files: ", allPhenoFiles)
108 allGenoFiles <- grep("genotype_data",
109 inFiles,
110 ignore.case = TRUE,
111 fixed = FALSE,
112 value = TRUE
115 popIds <- c()
116 combinedPhenoPops <- c()
117 cnt <- 0
118 traitRawPhenoData <- c()
120 logHeading <- paste0("Genomic Prediction Analysis Log for ", traitAbbr, ".\n")
121 logHeading <- append(logHeading, paste0("Date: ", format(Sys.time(), "%d %b %Y %H:%M"), "\n\n\n"))
122 logHeading <- format(logHeading, width=80, justify="c")
124 anovaLog <- paste0("#Preprocessing training population phenotype data.\n\n")
125 anovaLog <- append(anovaLog, paste0("This training population is composed of ", length(allPhenoFiles), " trials. Adjusted means or arithmetic trait values for each trial will be calculated as described below. After that, single trait values for the entries will be calculated by averaging trait adjusted means/arithmetic averages across the trials.\n\n"))
127 for (popPhenoFile in allPhenoFiles) {
129 cnt <- cnt + 1
131 phenoData <- fread(popPhenoFile,
132 header = TRUE,
133 sep="\t",
134 na.strings = c("NA", "", "--", "-", "."))
136 phenoData <- data.frame(phenoData)
138 meansResult <- getAdjMeans(phenoData,
139 traitName = traitAbbr,
140 calcAverages = TRUE,
141 logReturn = TRUE)
143 anovaLog <- append(anovaLog, meansResult$log)
144 phenoTrait <- meansResult$adjMeans
146 keepMetaCols <- c('observationUnitName', 'germplasmName', 'studyDbId', 'locationName',
147 'studyYear', 'replicate', 'blockNumber')
149 trialTraitRawPhenoData <- phenoData %>%
150 select(c(keepMetaCols, traitAbbr))
152 popIdFile <- basename(popPhenoFile)
153 popId <- str_extract(popIdFile, "\\d+")
154 popIds <- c(popIds, popId)
156 newTraitName <- paste(traitAbbr, popId, sep = "_")
157 colnames(phenoTrait)[2] <- newTraitName
159 if (cnt == 1 ) {
160 combinedPhenoPops <- phenoTrait
161 traitRawPhenoData <- trialTraitRawPhenoData
162 } else {
163 combinedPhenoPops <- full_join(combinedPhenoPops, phenoTrait, by='germplasmName')
164 traitRawPhenoData <- bind_rows(traitRawPhenoData, trialTraitRawPhenoData)
168 combinedPhenoPops <- column_to_rownames(combinedPhenoPops, var='germplasmName')
170 # #fill in missing data in combined phenotype dataset
171 # #using row means
172 naIndices <- which(is.na(combinedPhenoPops), arr.ind=TRUE)
173 combinedPhenoPops <- as.matrix(combinedPhenoPops)
174 combinedPhenoPops[naIndices] <- rowMeans(combinedPhenoPops, na.rm=TRUE)[naIndices[,1]]
175 combinedPhenoPops <- as.data.frame(combinedPhenoPops)
177 message("combined total number of stocks in phenotype dataset (before averaging): ", length(rownames(combinedPhenoPops)))
179 combinedPhenoPops$Average<-round(apply(combinedPhenoPops, 1, function(x) { mean(x) }), digits = 2)
181 combinedGenoPops <- c()
183 if (file.size(combinedGenoFile) < 100 ) {
184 combinedGenoPops <- combineGenoData(allGenoFiles)
185 combinedGenoPops$trial <- NULL
186 combinedGenoPops <- combinedGenoPops[order(rownames(combinedGenoPops)), ]
189 message("writing data to files...")
190 #if(length(combinedPhenoFile) != 0 )
192 fwrite(combinedPhenoPops,
193 file = combinedPhenoFile,
194 sep = "\t",
195 quote = FALSE,
196 row.names = TRUE,
199 if (!is.null(traitRawPhenoData) & length(traitRawPhenoFile) != 0) {
201 fwrite(traitRawPhenoData,
202 file = traitRawPhenoFile,
203 row.names = FALSE,
204 sep = "\t",
205 na = 'NA',
206 quote = FALSE,
209 if(!is.null(combinedGenoPops)) {
210 fwrite(combinedGenoPops,
211 file = combinedGenoFile,
212 sep = "\t",
213 quote = FALSE,
214 row.names = TRUE,
218 cat(anovaLog, fill = TRUE, file = analysisReportFile, append=FALSE)
220 q(save = "no", runLast = FALSE)