3 #runs population structure analysis using PCA from SNPRelate, a bioconductor R package
6 # Isaak Y Tecle (iyt2@cornell.edu)
13 library(genoDataFilter
)
17 library(phenoAnalysis
)
20 allArgs
<- commandArgs()
22 outputFile
<- grep("output_files", allArgs
, value
= TRUE)
23 outputFiles
<- scan(outputFile
, what
= "character")
25 inputFile
<- grep("input_files", allArgs
, value
= TRUE)
26 inputFiles
<- scan(inputFile
, what
= "character")
28 scoresFile
<- grep("pca_scores", outputFiles
, value
= TRUE)
29 screeDataFile
<- grep("pca_scree_data", outputFiles
, value
= TRUE)
30 screeFile
<- grep("pca_scree_plot", outputFiles
, value
= TRUE)
31 loadingsFile
<- grep("pca_loadings", outputFiles
, value
= TRUE)
32 varianceFile
<- grep("pca_variance", outputFiles
, value
= TRUE)
33 combinedDataFile
<- grep("combined_pca_data_file", outputFiles
, value
= TRUE)
35 if (is
.null(scoresFile
))
37 stop("Scores output file is missing.")
41 if (is
.null(loadingsFile
))
43 stop("Laodings file is missing.")
49 filteredGenoFile
<- c()
54 pcF
<- grepl("genotype", ignore
.case
=TRUE, inputFiles
)
55 dataType
<- ifelse(isTRUE(pcF
[1]), 'genotype', 'phenotype')
57 if (dataType
== 'genotype') {
58 if (length(inputFiles
) > 1 ) {
59 allGenoFiles
<- inputFiles
60 genoData
<- combineGenoData(allGenoFiles
)
62 genoMetaData
<- genoData$trial
63 genoData$trial
<- NULL
66 genoDataFile
<- grep("genotype_data", inputFiles
, value
= TRUE)
67 genoData
<- fread(genoDataFile
,
69 na
.strings
= c("NA", " ", "--", "-", "."))
72 if (is
.null(genoData
)) {
74 filteredGenoFile
<- grep("filtered_genotype_data_",
78 if (filteredGenoFile
) {
79 genoData
<- fread(filteredGenoFile
, header
= TRUE)
83 genoData
<- unique(genoData
, by
='V1')
84 genoData
<- data
.frame(genoData
)
85 genoData
<- column_to_rownames(genoData
, 'V1')
88 } else if (dataType
== 'phenotype') {
90 metaFile
<- grep("meta", inputFiles
, value
= TRUE)
91 phenoFiles
<- grep("phenotype_data", inputFiles
, value
= TRUE)
93 if (length(phenoFiles
) > 1 ) {
95 phenoData
<- combinePhenoData(phenoFiles
, metaDataFile
= metaFile
)
96 phenoData
<- summarizeTraits(phenoData
, groupBy
=c('studyDbId', 'germplasmName'))
98 if (all(is
.na(phenoData$locationName
))) {
99 phenoData$locationName
<- 'location'
102 phenoData
<- na
.omit(phenoData
)
103 genoMetaData
<- phenoData$studyDbId
105 phenoData
<- phenoData
%>% mutate(germplasmName
= paste0(germplasmName
, '_', studyDbId
))
106 dropCols
= c('replicate', 'blockNumber', 'locationName', 'studyDbId', 'studyYear')
107 phenoData
<- phenoData
%>% select(-dropCols
)
108 rownames(phenoData
) <- NULL
109 phenoData
<- column_to_rownames(phenoData
, var
="germplasmName")
111 phenoDataFile
<- grep("phenotype_data", inputFiles
, value
= TRUE)
113 phenoData
<- cleanAveragePhenotypes(inputFiles
, metaFile
)
114 phenoData
<- na
.omit(phenoData
)
117 phenoData
<- phenoData
[, apply(phenoData
, 2, var
) != 0 ]
118 phenoData
<- scale(phenoData
, center
=TRUE, scale
=TRUE)
119 phenoData
<- round(phenoData
, 3)
123 if (is
.null(genoData
) && is
.null(phenoData
)) {
124 stop("There is no data to run PCA.")
129 genoDataMissing
<- c()
130 if (dataType
== 'genotype') {
131 # if (is.null(filteredGenoFile) == TRUE) {
132 ##genoDataFilter::filterGenoData
133 genoData
<- convertToNumeric(genoData
)
134 genoData
<- filterGenoData(genoData
, maf
=0.01)
135 genoData
<- roundAlleleDosage(genoData
)
137 message("No. of geno missing values, ", sum(is
.na(genoData
)) )
138 if (sum(is
.na(genoData
)) > 0) {
139 genoDataMissing
<- c('yes')
140 # genoData <- na.roughfix(genoData)
145 ## nCores <- detectCores()
146 ## message('no cores: ', nCores)
148 ## nCores <- (nCores %/% 2)
154 if (!is
.null(genoData
)) {
157 } else if(!is
.null(phenoData
)) {
163 message("No. of missing values, ", sum(is
.na(pcaData
)) )
164 if (sum(is
.na(pcaData
)) > 0) {
165 pcaData
<- na
.roughfix(pcaData
)
168 pcsCnt
<- ifelse(ncol(pcaData
) < 10, ncol(pcaData
), 10)
169 pca
<- prcomp(pcaData
, retx
=TRUE)
172 scores
<- data
.frame(pca$x
)
173 scores
<- scores
[, 1:pcsCnt
]
174 scores
<- round(scores
, 3)
176 if (!is
.null(genoMetaData
)) {
177 scores$trial
<- genoMetaData
178 scores
<- scores
%>% select(trial
, everything()) %>% data
.frame
181 scores
<- scores
%>% select(trial
, everything()) %>% data
.frame
184 scores
<- scores
[order(row
.names(scores
)), ]
186 varianceAllPCs
<- data
.frame(pca$importance
)
187 varianceSelectPCs
<- varianceAllPCs
[2, 1:pcsCnt
]
188 varianceSelectPCs
<- round(varianceSelectPCs
, 4) * 100
189 varianceSelectPCs
<- data
.frame(t(varianceSelectPCs
))
191 colnames(varianceSelectPCs
) <- 'Variances'
193 loadings
<- data
.frame(pca$rotation
)
194 loadings
<- loadings
[, 1:pcsCnt
]
195 loadings
<- round(loadings
, 3)
197 varianceAllPCs
<- varianceAllPCs
[2, ] %>%
201 colnames(varianceAllPCs
) <- 'Variances'
202 varianceAllPCs
<- data
.frame(varianceAllPCs
)
204 varianceAllPCs
<- rownames_to_column(varianceAllPCs
, 'PCs')
205 varianceAllPCs
<- data
.frame(varianceAllPCs
)
207 screePlot
<- ggplot(varianceAllPCs
, aes(x
= reorder(PCs
, -Variances
), y
= Variances
, group
= 1)) +
208 geom_line(color
="red") +
211 ylab('Explained variance (%)') +
212 theme(axis
.text
.x
= element_text(angle
= 90))
219 fwriteOutput
<- function (data
, dataFile
) {
229 fwriteOutput(scores
, scoresFile
)
230 fwriteOutput(varianceAllPCs
, screeDataFile
)
231 fwriteOutput(loadings
, loadingsFile
)
232 fwriteOutput(varianceSelectPCs
, varianceFile
)
234 if (!is
.null(genoData
)) {
235 if (length(inputFiles
) > 1) {
236 fwriteOutput(genoData
, combinedDataFile
)
241 # ## if (!is.null(genoDataMissing)) {
242 # ## fwrite(genoData,
243 # ## file = genoDataFile,
245 # ## row.names = TRUE,
252 q(save
= "no", runLast
= FALSE)