seedlot upload with accession synonyms. seedlot upload works to update existing seedlots
[sgn.git] / R / solGS / combine_populations.r
blob8cee408dafb88ad88ab402c7bc492f1fa5afe52c
1 #formats and combines phenotype (of a single trait)
2 #and genotype datasets of multiple
3 #populations
5 options(echo = FALSE)
7 library(stats)
8 library(stringr)
9 library(randomForest)
10 library(plyr)
11 library(lme4)
12 library(data.table)
13 library(phenoAnalysis)
15 allArgs <- commandArgs()
17 inFile <- grep("input_files",
18 allArgs,
19 ignore.case = TRUE,
20 perl = TRUE,
21 value = TRUE
24 outFile <- grep("output_files",
25 allArgs,
26 ignore.case = TRUE,
27 perl = TRUE,
28 value = TRUE
31 outFiles <- scan(outFile,
32 what = "character"
35 combinedGenoFile <- grep("genotype_data",
36 outFiles,
37 ignore.case = TRUE,
38 fixed = FALSE,
39 value = TRUE
42 combinedPhenoFile <- grep("phenotype_data",
43 outFiles,
44 ignore.case = TRUE,
45 fixed = FALSE,
46 value = TRUE
49 inFiles <- scan(inFile,
50 what = "character"
52 print(inFiles)
54 traitFile <- grep("trait_",
55 inFiles,
56 ignore.case = TRUE,
57 fixed = FALSE,
58 value = TRUE
61 trait <- scan(traitFile,
62 what = "character",
65 traitInfo<-strsplit(trait, "\t");
66 traitId<-traitInfo[[1]]
67 traitName<-traitInfo[[2]]
69 #extract trait phenotype data from all populations
70 #and combine them into one dataset
72 allPhenoFiles <- grep("phenotype_data",
73 inFiles,
74 ignore.case = TRUE,
75 fixed = FALSE,
76 value = TRUE
78 message("phenotype files: ", allPhenoFiles)
80 allGenoFiles <- grep("genotype_data",
81 inFiles,
82 ignore.case = TRUE,
83 fixed = FALSE,
84 value = TRUE
87 popsPhenoSize <- length(allPhenoFiles)
88 popsGenoSize <- length(allGenoFiles)
89 popIds <- c()
90 combinedPhenoPops <- c()
92 for (popPhenoNum in 1:popsPhenoSize) {
93 popId <- str_extract_all(allPhenoFiles[[popPhenoNum]], "\\d+")
95 popId <- popId[[1]][2]
96 popIds <- c(popIds, popId)
98 phenoData <- fread(allPhenoFiles[[popPhenoNum]],
99 na.strings = c("NA", " ", "--", "-", "."),
101 phenoData <- data.frame(phenoData)
103 phenoTrait <- getAdjMeans(phenoData, traitName)
105 newTraitName <- paste(traitName, popId, sep = "_")
106 colnames(phenoTrait)[2] <- newTraitName
108 if (popPhenoNum == 1 )
110 print('no need to combine, yet')
111 combinedPhenoPops <- phenoTrait
113 } else {
114 print('combining...')
116 combinedPhenoPops <- merge(combinedPhenoPops, phenoTrait, all=TRUE)
117 rownames(combinedPhenoPops) <- combinedPhenoPops[, 1]
118 combinedPhenoPops[, 1] <- NULL
123 #fill in missing data in combined phenotype dataset
124 #using row means
125 naIndices <- which(is.na(combinedPhenoPops), arr.ind=TRUE)
126 combinedPhenoPops <- as.matrix(combinedPhenoPops)
127 combinedPhenoPops[naIndices] <- rowMeans(combinedPhenoPops, na.rm=TRUE)[naIndices[,1]]
128 combinedPhenoPops <- as.data.frame(combinedPhenoPops)
130 message("combined total number of stocks in phenotype dataset (before averaging): ", length(rownames(combinedPhenoPops)))
132 combinedPhenoPops$Average<-round(apply(combinedPhenoPops,
134 function(x)
135 { mean(x) }
137 digits = 2
140 markersList <- c()
141 combinedGenoPops <- c()
143 for (popGenoNum in 1:popsGenoSize)
145 popId <- str_extract(allGenoFiles[[popGenoNum]], "\\d+")
146 popIds <- append(popIds, popId)
148 genoData <- fread(allGenoFiles[[popGenoNum]],
149 na.strings = c("NA", " ", "--", "-"),
152 genoData <- as.data.frame(genoData)
153 rownames(genoData) <- genoData[, 1]
154 genoData[, 1] <- NULL
156 popMarkers <- colnames(genoData)
157 message("No of markers from population ", popId, ": ", length(popMarkers))
159 message("sum of geno missing values: ", sum(is.na(genoData)))
160 genoData <- genoData[, colSums(is.na(genoData)) < nrow(genoData) * 0.5]
161 message("sum of geno missing values: ", sum(is.na(genoData)))
163 if (sum(is.na(genoData)) > 0)
165 message("sum of geno missing values: ", sum(is.na(genoData)))
166 genoData <- na.roughfix(genoData)
167 message("total number of stocks for pop ", popId,": ", length(rownames(genoData)))
170 if (popGenoNum == 1 )
172 print('no need to combine, yet')
173 combinedGenoPops <- genoData
175 } else {
176 print('combining genotype datasets...')
177 combinedGenoPops <-rbind(combinedGenoPops, genoData)
183 message("combined total number of stocks in genotype dataset: ", length(rownames(combinedGenoPops)))
184 #discard duplicate clones
185 combinedGenoPops <- unique(combinedGenoPops)
186 message("combined unique number of stocks in genotype dataset: ", length(rownames(combinedGenoPops)))
188 message("writing data into files...")
189 #if(length(combinedPhenoFile) != 0 )
191 fwrite(combinedPhenoPops,
192 file = combinedPhenoFile,
193 sep = "\t",
194 quote = FALSE,
195 row.names = TRUE,
199 #if(length(combinedGenoFile) != 0 )
201 fwrite(combinedGenoPops,
202 file = combinedGenoFile,
203 sep = "\t",
204 quote = FALSE,
205 row.names = TRUE,
209 q(save = "no", runLast = FALSE)