Merge pull request #4719 from solgenomics/topic/fixing_histogram
[sgn.git] / R / solGS / kclustering.r
blob7dabb3a22ffec8b3c931e1467e51e2c2b73e93b2
1 # SNOPSIS
3 # runs k-means cluster analysis
5 # AUTHOR Isaak Y Tecle (iyt2@cornell.edu)
8 options(echo = FALSE)
10 library(randomForest)
11 library(data.table)
12 library(genoDataFilter)
13 library(tibble)
14 library(dplyr)
15 library(fpc)
16 library(cluster)
17 library(ggfortify)
18 library(tibble)
19 library(stringi)
20 library(phenoAnalysis)
21 library(factoextra)
23 allArgs <- commandArgs()
25 outputFiles <- grep("output_files", allArgs, value = TRUE)
26 outputFiles <- scan(outputFiles, what = "character")
28 inputFiles <- grep("input_files", allArgs, value = TRUE)
29 inputFiles <- scan(inputFiles, what = "character")
31 optionsFile <- grep("options", inputFiles, value = TRUE)
32 clusterOptions <- read.table(optionsFile, header = TRUE, sep = "\t", stringsAsFactors = FALSE,
33 na.strings = "")
34 print(clusterOptions)
36 kmeansPlotFile <- grep("k-means_plot", outputFiles, value = TRUE)
37 kResultFile <- grep("result", outputFiles, value = TRUE)
38 elbowPlotFile <- grep("elbow_plot", outputFiles, value = TRUE)
39 clusterMeansFile <- grep("k-means_means", outputFiles, value = TRUE)
40 variancesFile <- grep("k-means_variances", outputFiles, value = TRUE)
41 reportFile <- grep("report", outputFiles, value = TRUE)
42 errorFile <- grep("error", outputFiles, value = TRUE)
44 combinedDataFile <- grep("combined_cluster_data_file", outputFiles, value = TRUE)
46 clusterOptions <- column_to_rownames(clusterOptions, var = "Params")
47 userKNumbers <- as.numeric(clusterOptions["k_numbers", 1])
48 dataType <- clusterOptions["data_type", 1]
49 selectionProp <- as.numeric(clusterOptions["selection_proportion", 1])
50 predictedTraits <- clusterOptions["predicted_traits", 1]
51 predictedTraits <- unlist(strsplit(predictedTraits, ","))
53 if (is.null(kResultFile)) {
54 stop("Clustering output file is missing.")
55 q("no", 1, FALSE)
58 clusterData <- c()
59 genoData <- c()
60 genoFiles <- c()
61 reportNotes <- c()
62 genoDataMissing <- c()
64 extractGenotype <- function(inputFiles) {
66 genoFiles <- grep("genotype_data", inputFiles, value = TRUE)
68 genoMetaData <- c()
69 filteredGenoFile <- c()
71 if (length(genoFiles) > 1) {
72 genoData <- combineGenoData(genoFiles)
74 genoMetaData <- genoData$trial
75 genoData$trial <- NULL
77 } else {
78 genoFile <- genoFiles
79 genoData <- fread(genoFile, header = TRUE, na.strings = c("NA", " ", "--",
80 "-", "."))
82 if (is.null(genoData)) {
83 filteredGenoFile <- grep("filtered_genotype_data_", genoFile, value = TRUE)
84 genoData <- fread(filteredGenoFile, header = TRUE)
87 genoData <- unique(genoData, by = "V1")
88 genoData <- data.frame(genoData)
89 genoData <- column_to_rownames(genoData, "V1")
92 if (is.null(genoData)) {
93 stop("There is no genotype dataset.")
94 q("no", 1, FALSE)
95 } else {
97 ## genoDataFilter::filterGenoData
98 genoData <- convertToNumeric(genoData)
99 genoData <- filterGenoData(genoData, maf = 0.01)
100 genoData <- roundAlleleDosage(genoData)
102 message("No. of geno missing values, ", sum(is.na(genoData)))
103 if (sum(is.na(genoData)) > 0) {
104 genoDataMissing <- c("yes")
105 genoData <- na.roughfix(genoData)
108 genoData <- data.frame(genoData)
112 set.seed(235)
114 clusterDataNotScaled <- c()
116 if (grepl("genotype", dataType, ignore.case = TRUE)) {
117 clusterData <- extractGenotype(inputFiles)
119 pca <- prcomp(clusterData, retx = TRUE)
120 pca <- summary(pca)
122 variances <- data.frame(pca$importance)
124 varProp <- variances[3, ]
125 varProp <- data.frame(t(varProp))
126 names(varProp) <- c("cumVar")
128 selectPcs <- varProp %>%
129 filter(cumVar <= 0.9)
130 pcsCnt <- nrow(selectPcs)
132 reportNotes <- paste0("Before clustering this dataset, principal component analysis (PCA) was perforemd on it to reduce the number of variables (dimensions). ")
133 reportNotes <- paste0(reportNotes, "Based on the PCA, ", pcsCnt, " PCs were used to do the clustering. ")
134 reportNotes <- paste0(reportNotes, "\n\nThe ", pcsCnt, " PCs explain 90% of the variance in the original dataset.",
135 "\n")
137 scores <- data.frame(pca$x)
138 scores <- scores[, 1:pcsCnt]
139 scores <- round(scores, 3)
141 variances <- variances[2, 1:pcsCnt]
142 variances <- round(variances, 4) * 100
143 variances <- data.frame(t(variances))
145 clusterData <- scores
146 } else {
148 if (grepl("gebv", dataType, ignore.case = TRUE)) {
149 gebvsFile <- grep("combined_gebvs", inputFiles, value = TRUE)
150 gebvsData <- data.frame(fread(gebvsFile, header = TRUE))
152 clusterNa <- gebvsData %>%
153 filter_all(any_vars(is.na(.)))
154 clusterData <- column_to_rownames(gebvsData, "V1")
155 } else if (grepl("phenotype", dataType, ignore.case = TRUE)) {
157 metaFile <- grep("meta", inputFiles, value = TRUE)
159 clusterData <- cleanAveragePhenotypes(inputFiles, metaDataFile = metaFile)
161 if (!is.na(predictedTraits) && length(predictedTraits) > 1) {
162 clusterData <- rownames_to_column(clusterData, var = "germplasmName")
163 clusterData <- clusterData %>%
164 select(c(germplasmName, predictedTraits))
165 clusterData <- column_to_rownames(clusterData, var = "germplasmName")
169 clusterData <- clusterData[, apply(clusterData, 2, function(x) var(x, na.rm=TRUE)) != 0]
170 clusterDataNotScaled <- na.omit(clusterData)
171 clusterData <- scale(clusterDataNotScaled, center = TRUE, scale = TRUE)
172 clusterData <- round(clusterData, 3)
173 reportNotes <- paste0(reportNotes, "Note: Data was standardized before clustering.",
174 "\n")
176 sIndexFile <- grep("selection_index", inputFiles, value = TRUE)
177 selectedIndexGenotypes <- c()
179 if (length(sIndexFile) != 0) {
180 sIndexData <- data.frame(fread(sIndexFile, header = TRUE))
181 selectionProp <- selectionProp * 0.01
182 selectedIndexGenotypes <- sIndexData %>%
183 top_frac(selectionProp)
185 selectedIndexGenotypes <- column_to_rownames(selectedIndexGenotypes, var = "V1")
187 if (!is.null(selectedIndexGenotypes)) {
188 clusterData <- rownames_to_column(clusterData, var = "germplasmName")
189 clusterData <- clusterData %>%
190 filter(germplasmName %in% rownames(selectedIndexGenotypes))
192 clusterData <- column_to_rownames(clusterData, var = "germplasmName")
196 kMeansOut <- kmeansruns(clusterData, runs = 10)
197 kCenters <- kMeansOut$bestk
199 if (!is.na(userKNumbers)) {
200 if (userKNumbers != 0) {
201 kCenters <- as.integer(userKNumbers)
202 reportNotes <- paste0(reportNotes, "\n\nThe data was partitioned into ", userKNumbers,
203 " clusters.\n")
207 reportNotes <- paste0(reportNotes, "\n\nAccording the kmeansruns algorithm from the fpc R package, the recommended number of clusters (k) for this data set is: ",
208 kCenters, "\n\nYou can also check the Elbow plot to evaluate how many clusters may be better suited for your purpose.")
210 kMeansOut <- kmeans(clusterData, centers = kCenters, nstart = 10)
211 kClusters <- data.frame(kMeansOut$cluster)
212 kClusters <- rownames_to_column(kClusters)
213 names(kClusters) <- c("germplasmName", "Cluster")
215 if (!is.null(clusterDataNotScaled)) {
216 clusterDataNotScaled <- rownames_to_column(clusterDataNotScaled, var = "germplasmName")
218 clusteredData <- inner_join(kClusters, clusterDataNotScaled, by = "germplasmName")
219 } else if (!is.null(selectedIndexGenotypes)) {
220 selectedIndexGenotypes <- rownames_to_column(selectedIndexGenotypes, var = "germplasmName")
221 clusteredData <- inner_join(kClusters, selectedIndexGenotypes, by = "germplasmName")
222 } else {
223 clusteredData <- kClusters
226 clusteredData <- clusteredData %>%
227 mutate_if(is.double, round, 2) %>%
228 arrange(Cluster)
230 # print(paste('size: ', '\n', '\n', round(kMeansOut$size, 2)))
231 # print(paste('centers: ','\n', round(kMeansOut$centers, 2)))
233 if (length(elbowPlotFile) && !file.info(elbowPlotFile)$size) {
234 message("running elbow method...")
235 png(elbowPlotFile)
236 print(fviz_nbclust(clusterData, k.max = 20, FUNcluster = kmeans, method = "wss"))
237 dev.off()
240 png(kmeansPlotFile)
241 ggplot2::autoplot(kMeansOut, data = clusterData, frame = TRUE, x = 1, y = 2)
242 # fviz_cluster(kMeansOut, geom = "point", main = "", data = clusterData)
243 dev.off()
245 clusterMeans <- c()
246 if (!grepl('genotype', kResultFile)) {
247 message('adding cluster means to clusters...')
248 clusterMeans <- aggregate(clusterDataNotScaled, by = list(cluster = kMeansOut$cluster),
249 mean)
251 clusterMeans <- clusterMeans %>%
252 select(-germplasmName) %>%
253 mutate_if(is.double, round, 2)
257 cat(reportNotes, file = reportFile, sep = "\n", append = TRUE)
259 variances <- paste0("Variances output: ")
260 variances <- append(variances, (paste0("\nBetween clusters sum of squares (betweenss): ",
261 "\t", round(kMeansOut$betweenss, 2))))
262 withinss <- round(kMeansOut$withinss, 2)
263 withinss <- c(paste0("\nWithin clusters sum of squares (withinss): "), "\t", paste("\n\t",
264 withinss))
265 variances <- append(variances, withinss)
266 variances <- append(variances, (paste0("\nTotal within cluster sum of squares (tot.withinss): ",
267 "\t", round(kMeansOut$tot.withinss, 2))))
269 variances <- append(variances, (paste0("\nTotal sum of squares (totss): ", "\t",
270 round(kMeansOut$totss, 2))))
272 cat(variances, file = variancesFile, sep = "", append = TRUE)
274 if (length(genoFiles) > 1) {
275 fwrite(genoData, file = combinedDataFile, sep = "\t", row.names = TRUE, quote = FALSE,
279 if (length(kResultFile)) {
280 fwrite(clusteredData, file = kResultFile, sep = "\t", row.names = FALSE, quote = FALSE,
284 if (length(clusterMeansFile) && !is.null(clusterMeans)) {
285 fwrite(clusterMeans, file = clusterMeansFile, sep = "\t", row.names = FALSE,
286 quote = FALSE, )
289 message("Done clustering.")
292 ####
293 q(save = "no", runLast = FALSE)
294 ####