Merge pull request #4719 from solgenomics/topic/fixing_histogram
[sgn.git] / R / solGS / combine_genotype_data.r
blobbdf0bd2e9b87bad37410ecdb25d345d62aa560ad
1 #formats and combines phenotype (of a single trait)
2 #and genotype datasets of multiple
3 #populations
5 options(echo = FALSE)
7 library(stringr)
8 library(randomForest)
9 library(lme4)
10 library(data.table)
11 library(phenoAnalysis)
12 library(dplyr)
13 library(tibble)
15 allArgs <- commandArgs()
17 inFile <- grep("input_files",
18 allArgs,
19 ignore.case = TRUE,
20 perl = TRUE,
21 value = TRUE
24 outFile <- grep("output_files",
25 allArgs,
26 ignore.case = TRUE,
27 perl = TRUE,
28 value = TRUE
31 outFiles <- scan(outFile,
32 what = "character"
35 combinedGenoFile <- grep("genotype_data",
36 outFiles,
37 ignore.case = TRUE,
38 fixed = FALSE,
39 value = TRUE
42 combinedPhenoFile <- grep("phenotype_data",
43 outFiles,
44 ignore.case = TRUE,
45 fixed = FALSE,
46 value = TRUE
49 inFiles <- scan(inFile,
50 what = "character"
52 print(inFiles)
54 traitFile <- grep("trait_",
55 inFiles,
56 ignore.case = TRUE,
57 fixed = FALSE,
58 value = TRUE
61 trait <- scan(traitFile,
62 what = "character",
65 traitInfo <- strsplit(trait, "\t");
66 traitId <- traitInfo[[1]]
67 traitName <- traitInfo[[2]]
69 #extract trait phenotype data from all populations
70 #and combine them into one dataset
72 allPhenoFiles <- grep("phenotype_data",
73 inFiles,
74 ignore.case = TRUE,
75 fixed = FALSE,
76 value = TRUE
78 message("phenotype files: ", allPhenoFiles)
80 allGenoFiles <- grep("genotype_data",
81 inFiles,
82 ignore.case = TRUE,
83 fixed = FALSE,
84 value = TRUE
87 popIds <- c()
88 combinedPhenoPops <- c()
89 cnt <- 0
91 for (popPhenoFile in allPhenoFiles) {
93 cnt <- cnt + 1
95 phenoData <- fread(popPhenoFile,
96 na.strings = c("NA", " ", "--", "-", "."))
98 phenoData <- data.frame(phenoData)
100 phenoTrait <- getAdjMeans(phenoData,
101 traitName=traitName,
102 calcAverages=TRUE)
104 popIdFile <- basename(popPhenoFile)
105 popId <- str_extract(popIdFile, "\\d+")
106 popIds <- c(popIds, popId)
108 newTraitName <- paste(traitName, popId, sep = "_")
109 colnames(phenoTrait)[2] <- newTraitName
111 if (cnt == 1 ) {
112 print('no need to combine, yet')
113 combinedPhenoPops <- phenoTrait
115 } else {
116 print('combining...phenotypes')
118 combinedPhenoPops <- full_join(combinedPhenoPops, phenoTrait, by='genotypes')
122 combinedPhenoPops <- column_to_rownames(combinedPhenoPops, var='genotypes')
124 # #fill in missing data in combined phenotype dataset
125 # #using row means
126 naIndices <- which(is.na(combinedPhenoPops), arr.ind=TRUE)
127 combinedPhenoPops <- as.matrix(combinedPhenoPops)
128 combinedPhenoPops[naIndices] <- rowMeans(combinedPhenoPops, na.rm=TRUE)[naIndices[,1]]
129 combinedPhenoPops <- as.data.frame(combinedPhenoPops)
131 message("combined total number of stocks in phenotype dataset (before averaging): ", length(rownames(combinedPhenoPops)))
133 combinedPhenoPops$Average<-round(apply(combinedPhenoPops,
135 function(x)
136 { mean(x) }
138 digits = 2
141 markersList <- c()
142 combinedGenoPops <- c()
143 cnt <- 0
145 for (popGenoFile in allGenoFiles) {
147 uniqGenoNames <- c()
148 cnt <- cnt + 1
150 genoData <- fread(popGenoFile,
151 na.strings = c("NA", " ", "--", "-"),
154 genoData <- data.frame(genoData)
155 message('cnt of genotypes in dataset: ', length(rownames(genoData)))
156 genoData <- genoData[!duplicated(genoData[,'V1']), ]
157 message('cnt of unique genotypes in dataset: ', length(rownames(genoData)))
158 rownames(genoData) <- genoData[, 1]
159 genoData[, 1] <- NULL
161 popGenoFile <- basename(popGenoFile)
162 popId <- str_extract(popGenoFile, "\\d+")
163 popIds <- c(popIds, popId)
165 #popMarkers <- colnames(genoData)
166 #message("No of markers from population ", popId, ": ", length(popMarkers))
168 # message("sum of geno missing values: ", sum(is.na(genoData)))
169 # genoData <- genoData[, colSums(is.na(genoData)) < nrow(genoData) * 0.5]
170 # genoData <- data.frame(genoData)
171 # message("sum of geno missing values: ", sum(is.na(genoData)))
172 ## if (sum(is.na(genoData)) > 0) {
173 ## message("sum of geno missing values: ", sum(is.na(genoData)))
174 ## genoData <- na.roughfix(genoData)
175 ## message("total number of stocks for pop ", popId,": ", length(rownames(genoData)))
176 ## }
178 if (cnt == 1 ) {
179 print('no need to combine, yet')
180 message('cnt of genotypes first dataset: ', length(rownames(genoData)))
181 combinedGenoPops <- genoData
183 } else {
184 print('combining genotype datasets...')
186 uniqGenoNames <- unique(rownames(combinedGenoPops))
188 message('cnt of genotypes in new dataset ', popId, ': ', length(rownames(genoData)) )
190 genoData <- genoData[!(rownames(genoData) %in% uniqGenoNames),]
192 message('cnt of unique genotypes from new dataset ', popId, ': ', length(rownames(genoData)))
194 if (!is.null(genoData)) {
195 combinedGenoPops <- rbind(combinedGenoPops, genoData)
196 } else {
197 message('dataset ', popId, ' has no unique genotypes.')
203 combinedGenoPops <- combinedGenoPops[order(rownames(combinedGenoPops)), ]
204 message("combined number of genotypes in combined dataset: ", length(rownames(combinedGenoPops)))
206 message("writing data to files...")
207 #if(length(combinedPhenoFile) != 0 )
209 fwrite(combinedPhenoPops,
210 file = combinedPhenoFile,
211 sep = "\t",
212 quote = FALSE,
213 row.names = TRUE,
217 #if(length(combinedGenoFile) != 0 )
219 fwrite(combinedGenoPops,
220 file = combinedGenoFile,
221 sep = "\t",
222 quote = FALSE,
223 row.names = TRUE,
227 q(save = "no", runLast = FALSE)