Merge pull request #4719 from solgenomics/topic/fixing_histogram
[sgn.git] / R / solGS / pca.r
blob5320e747b6c6611bb9e8dab8021ddf0e61c7ef32
1 #SNOPSIS
3 #runs population structure analysis using PCA from SNPRelate, a bioconductor R package
5 #AUTHOR
6 # Isaak Y Tecle (iyt2@cornell.edu)
9 options(echo = FALSE)
11 library(randomForest)
12 library(data.table)
13 library(genoDataFilter)
14 library(tibble)
15 library(dplyr)
16 library(stringr)
17 library(phenoAnalysis)
18 library(ggplot2)
20 allArgs <- commandArgs()
22 outputFile <- grep("output_files", allArgs, value = TRUE)
23 outputFiles <- scan(outputFile, what = "character")
25 inputFile <- grep("input_files", allArgs, value = TRUE)
26 inputFiles <- scan(inputFile, what = "character")
28 scoresFile <- grep("pca_scores", outputFiles, value = TRUE)
29 screeDataFile <- grep("pca_scree_data", outputFiles, value = TRUE)
30 screeFile <- grep("pca_scree_plot", outputFiles, value = TRUE)
31 loadingsFile <- grep("pca_loadings", outputFiles, value = TRUE)
32 varianceFile <- grep("pca_variance", outputFiles, value = TRUE)
33 combinedDataFile <- grep("combined_pca_data_file", outputFiles, value = TRUE)
35 if (is.null(scoresFile))
37 stop("Scores output file is missing.")
38 q("no", 1, FALSE)
41 if (is.null(loadingsFile))
43 stop("Laodings file is missing.")
44 q("no", 1, FALSE)
47 genoData <- c()
48 genoMetaData <- c()
49 filteredGenoFile <- c()
50 phenoData <- c()
52 set.seed(235)
54 pcF <- grepl("genotype", ignore.case=TRUE, inputFiles)
55 dataType <- ifelse(isTRUE(pcF[1]), 'genotype', 'phenotype')
57 if (dataType == 'genotype') {
58 if (length(inputFiles) > 1 ) {
59 allGenoFiles <- inputFiles
60 genoData <- combineGenoData(allGenoFiles)
62 genoMetaData <- genoData$trial
63 genoData$trial <- NULL
65 } else {
66 genoDataFile <- grep("genotype_data", inputFiles, value = TRUE)
67 genoData <- fread(genoDataFile,
68 header = TRUE,
69 na.strings = c("NA", " ", "--", "-", "."))
72 if (is.null(genoData)) {
74 filteredGenoFile <- grep("filtered_genotype_data_",
75 genoDataFile,
76 value = TRUE)
78 if (filteredGenoFile) {
79 genoData <- fread(filteredGenoFile, header = TRUE)
83 genoData <- unique(genoData, by='V1')
84 genoData <- data.frame(genoData)
85 genoData <- column_to_rownames(genoData, 'V1')
88 } else if (dataType == 'phenotype') {
90 metaFile <- grep("meta", inputFiles, value = TRUE)
91 phenoFiles <- grep("phenotype_data", inputFiles, value = TRUE)
93 if (length(phenoFiles) > 1 ) {
95 phenoData <- combinePhenoData(phenoFiles, metaDataFile = metaFile)
96 phenoData <- summarizeTraits(phenoData, groupBy=c('studyDbId', 'germplasmName'))
98 if (all(is.na(phenoData$locationName))) {
99 phenoData$locationName <- 'location'
102 phenoData <- na.omit(phenoData)
103 genoMetaData <- phenoData$studyDbId
105 phenoData <- phenoData %>% mutate(germplasmName = paste0(germplasmName, '_', studyDbId))
106 dropCols = c('replicate', 'blockNumber', 'locationName', 'studyDbId', 'studyYear')
107 phenoData <- phenoData %>% select(-dropCols)
108 rownames(phenoData) <- NULL
109 phenoData <- column_to_rownames(phenoData, var="germplasmName")
110 } else {
111 phenoDataFile <- grep("phenotype_data", inputFiles, value = TRUE)
113 phenoData <- cleanAveragePhenotypes(inputFiles, metaFile)
114 phenoData <- na.omit(phenoData)
117 phenoData <- phenoData[, apply(phenoData, 2, var) != 0 ]
118 phenoData <- scale(phenoData, center=TRUE, scale=TRUE)
119 phenoData <- round(phenoData, 3)
123 if (is.null(genoData) && is.null(phenoData)) {
124 stop("There is no data to run PCA.")
125 q("no", 1, FALSE)
129 genoDataMissing <- c()
130 if (dataType == 'genotype') {
131 # if (is.null(filteredGenoFile) == TRUE) {
132 ##genoDataFilter::filterGenoData
133 genoData <- convertToNumeric(genoData)
134 genoData <- filterGenoData(genoData, maf=0.01)
135 genoData <- roundAlleleDosage(genoData)
137 message("No. of geno missing values, ", sum(is.na(genoData)) )
138 if (sum(is.na(genoData)) > 0) {
139 genoDataMissing <- c('yes')
140 # genoData <- na.roughfix(genoData)
145 ## nCores <- detectCores()
146 ## message('no cores: ', nCores)
147 ## if (nCores > 1) {
148 ## nCores <- (nCores %/% 2)
149 ## } else {
150 ## nCores <- 1
151 ## }
153 pcaData <- c()
154 if (!is.null(genoData)) {
155 pcaData <- genoData
156 genoData <- NULL
157 } else if(!is.null(phenoData)) {
158 pcaData <- phenoData
159 phenoData <- NULL
163 message("No. of missing values, ", sum(is.na(pcaData)) )
164 if (sum(is.na(pcaData)) > 0) {
165 pcaData <- na.roughfix(pcaData)
168 pcsCnt <- ifelse(ncol(pcaData) < 10, ncol(pcaData), 10)
169 pca <- prcomp(pcaData, retx=TRUE)
170 pca <- summary(pca)
172 scores <- data.frame(pca$x)
173 scores <- scores[, 1:pcsCnt]
174 scores <- round(scores, 3)
176 if (!is.null(genoMetaData)) {
177 scores$trial <- genoMetaData
178 scores <- scores %>% select(trial, everything()) %>% data.frame
179 } else {
180 scores$trial <- 1000
181 scores <- scores %>% select(trial, everything()) %>% data.frame
184 scores <- scores[order(row.names(scores)), ]
186 varianceAllPCs <- data.frame(pca$importance)
187 varianceSelectPCs <- varianceAllPCs[2, 1:pcsCnt]
188 varianceSelectPCs <- round(varianceSelectPCs, 4) * 100
189 varianceSelectPCs <- data.frame(t(varianceSelectPCs))
191 colnames(varianceSelectPCs) <- 'Variances'
193 loadings <- data.frame(pca$rotation)
194 loadings <- loadings[, 1:pcsCnt]
195 loadings <- round(loadings, 3)
197 varianceAllPCs <- varianceAllPCs[2, ] %>%
198 t() %>%
199 round(3) * 100
201 colnames(varianceAllPCs) <- 'Variances'
202 varianceAllPCs <- data.frame(varianceAllPCs)
204 varianceAllPCs <- rownames_to_column(varianceAllPCs, 'PCs')
205 varianceAllPCs <- data.frame(varianceAllPCs)
207 screePlot <- ggplot(varianceAllPCs, aes(x = reorder(PCs, -Variances), y = Variances, group = 1)) +
208 geom_line(color="red") +
209 geom_point() +
210 xlab('PCs') +
211 ylab('Explained variance (%)') +
212 theme(axis.text.x = element_text(angle = 90))
214 png(screeFile)
215 screePlot
216 dev.off()
219 fwriteOutput <- function (data, dataFile) {
220 fwrite(data,
221 file = dataFile,
222 sep = "\t",
223 row.names = TRUE,
224 quote = FALSE,
229 fwriteOutput(scores, scoresFile)
230 fwriteOutput(varianceAllPCs, screeDataFile)
231 fwriteOutput(loadings, loadingsFile)
232 fwriteOutput(varianceSelectPCs, varianceFile)
234 if (!is.null(genoData)) {
235 if (length(inputFiles) > 1) {
236 fwriteOutput(genoData, combinedDataFile)
241 # ## if (!is.null(genoDataMissing)) {
242 # ## fwrite(genoData,
243 # ## file = genoDataFile,
244 # ## sep = "\t",
245 # ## row.names = TRUE,
246 # ## quote = FALSE,
247 # ## )
249 # ## }
252 q(save = "no", runLast = FALSE)