Merge branch 'topic/new-blast'
[sgn.git] / R / pca.r
blobf7c0d8e7fd1e84a5f9d2209e82129b9e79440ca0
1 #SNOPSIS
3 #runs population structure analysis using singular values decomposition (SVD)
5 #AUTHOR
6 # Isaak Y Tecle (iyt2@cornell.edu)
9 options(echo = FALSE)
11 library(randomForest)
12 library(irlba)
13 library(data.table)
16 allArgs <- commandArgs()
18 outFile <- grep("output_files",
19 allArgs,
20 ignore.case = TRUE,
21 perl = TRUE,
22 value = TRUE
25 outFiles <- scan(outFile,
26 what = "character"
29 genoDataFile <- grep("genotype_data",
30 allArgs,
31 ignore.case = TRUE,
32 fixed = FALSE,
33 value = TRUE
36 scoresFile <- grep("pca_scores",
37 outFiles,
38 ignore.case = TRUE,
39 fixed = FALSE,
40 value = TRUE
43 loadingsFile <- grep("pca_loadings",
44 outFiles,
45 ignore.case = TRUE,
46 fixed = FALSE,
47 value = TRUE
50 varianceFile <- grep("pca_variance",
51 outFiles,
52 ignore.case = TRUE,
53 fixed = FALSE,
54 value = TRUE
57 message("genotype file: ", genoDataFile)
58 message("pca scores file: ", scoresFile)
59 message("pca loadings file: ", loadingsFile)
60 message("pca variance file: ", varianceFile)
62 if (is.null(genoDataFile))
64 stop("genotype dataset missing.")
65 q("no", 1, FALSE)
68 if (is.null(scoresFile))
70 stop("Scores output file is missing.")
71 q("no", 1, FALSE)
74 if (is.null(loadingsFile))
76 stop("Laodings file is missing.")
77 q("no", 1, FALSE)
80 genoData <- as.data.frame(fread(genoDataFile,
81 na.strings = c("NA", " ", "--", "-", "."),
84 rownames(genoData) <- genoData[, 1]
85 genoData[, 1] <- NULL
86 str(genoData)
88 genoData[1:2, 1:5]
90 message("No. of geno missing values, ", sum(is.na(genoData)) )
91 ncol(genoData)
92 genoData <- genoData[, colSums(is.na(genoData)) < nrow(genoData) * 0.5]
93 ncol(genoData)
94 #genoData <- as.data.frame(round(genoData, digits=0))
95 #str(genoData)
96 #change genotype coding to [-1, 0, 1], to use the A.mat ) if [0, 1, 2]
97 #genoTrCode <- grep("2", genoData[1, ], fixed=TRUE, value=TRUE)
98 #if(length(genoTrCode) != 0) {
99 # genoData <- genoData - 1
102 message("No. of geno missing values, ", sum(is.na(genoData)) )
103 genoDataMissing <- c()
104 if (sum(is.na(genoData)) > 0) {
105 genoDataMissing <- c('yes')
106 genoData <- na.roughfix(genoData)
110 ######
111 genotypes <- rownames(genoData)
112 svdOut <- irlba(scale(genoData, TRUE, FALSE), nu=10, nv=10)
113 scores <- round(svdOut$u %*% diag(svdOut$d), digits=2)
114 loadings <- round(svdOut$v, digits=5)
115 totalVar <- sum(svdOut$d)
116 variances <- unlist(
117 lapply(svdOut$d,
118 function(x)
119 round((x / totalVar)*100, digits=2)
123 variances <- data.frame(variances)
124 scores <- data.frame(scores)
125 rownames(scores) <- genotypes
127 headers <- c()
128 for (i in 1:10) {
129 headers[i] <- paste("PC", i, sep='')
132 colnames(scores) <- c(headers)
134 write.table(scores,
135 file = scoresFile,
136 sep = "\t",
137 col.names = NA,
138 quote = FALSE,
139 append = FALSE
142 write.table(loadings,
143 file = loadingsFile,
144 sep = "\t",
145 col.names = NA,
146 quote = FALSE,
147 append = FALSE
150 write.table(variances,
151 file = varianceFile,
152 sep = "\t",
153 col.names = NA,
154 quote = FALSE,
155 append = FALSE
159 if (!is.null(genoDataMissing)) {
160 write.table(genoData,
161 file = genoDataFile,
162 sep = "\t",
163 col.names = NA,
164 quote = FALSE,
169 q(save = "no", runLast = FALSE)