R/solGS/kclustering.r

   1 # SNOPSIS
   2
   3 # runs k-means cluster analysis
   4
   5 # AUTHOR Isaak Y Tecle (iyt2@cornell.edu)
   6
   7
   8 options(echo = FALSE)
   9
  10 library(randomForest)
  11 library(data.table)
  12 library(genoDataFilter)
  13 library(tibble)
  14 library(dplyr)
  15 library(fpc)
  16 library(cluster)
  17 library(ggfortify)
  18 library(tibble)
  19 library(stringi)
  20 library(phenoAnalysis)
  21 library(factoextra)
  22
  23 allArgs <- commandArgs()
  24
  25 outputFiles <- grep("output_files", allArgs, value = TRUE)
  26 outputFiles <- scan(outputFiles, what = "character")
  27
  28 inputFiles <- grep("input_files", allArgs, value = TRUE)
  29 inputFiles <- scan(inputFiles, what = "character")
  30
  31 optionsFile <- grep("options", inputFiles, value = TRUE)
  32 clusterOptions <- read.table(optionsFile, header = TRUE, sep = "\t", stringsAsFactors = FALSE,
  33     na.strings = "")
  34 print(clusterOptions)
  35
  36 kmeansPlotFile <- grep("k-means_plot", outputFiles, value = TRUE)
  37 kResultFile <- grep("result", outputFiles, value = TRUE)
  38 elbowPlotFile <- grep("elbow_plot", outputFiles, value = TRUE)
  39 clusterMeansFile <- grep("k-means_means", outputFiles, value = TRUE)
  40 variancesFile <- grep("k-means_variances", outputFiles, value = TRUE)
  41 reportFile <- grep("report", outputFiles, value = TRUE)
  42 errorFile <- grep("error", outputFiles, value = TRUE)
  43
  44 combinedDataFile <- grep("combined_cluster_data_file", outputFiles, value = TRUE)
  45
  46 clusterOptions <- column_to_rownames(clusterOptions, var = "Params")
  47 userKNumbers <- as.numeric(clusterOptions["k_numbers", 1])
  48 dataType <- clusterOptions["data_type", 1]
  49 selectionProp <- as.numeric(clusterOptions["selection_proportion", 1])
  50 predictedTraits <- clusterOptions["predicted_traits", 1]
  51 predictedTraits <- unlist(strsplit(predictedTraits, ","))
  52
  53 if (is.null(kResultFile)) {
  54     stop("Clustering output file is missing.")
  55     q("no", 1, FALSE)
  56 }
  57
  58 clusterData <- c()
  59 genoData <- c()
  60 genoFiles <- c()
  61 reportNotes <- c()
  62 genoDataMissing <- c()
  63
  64 extractGenotype <- function(inputFiles) {
  65
  66     genoFiles <- grep("genotype_data", inputFiles, value = TRUE)
  67
  68     genoMetaData <- c()
  69     filteredGenoFile <- c()
  70
  71     if (length(genoFiles) > 1) {
  72         genoData <- combineGenoData(genoFiles)
  73
  74         genoMetaData <- genoData$trial
  75         genoData$trial <- NULL
  76
  77     } else {
  78         genoFile <- genoFiles
  79         genoData <- fread(genoFile, header = TRUE, na.strings = c("NA", " ", "--",
  80             "-", "."))
  81
  82         if (is.null(genoData)) {
  83             filteredGenoFile <- grep("filtered_genotype_data_", genoFile, value = TRUE)
  84             genoData <- fread(filteredGenoFile, header = TRUE)
  85         }
  86
  87         genoData <- unique(genoData, by = "V1")
  88         genoData <- data.frame(genoData)
  89         genoData <- column_to_rownames(genoData, "V1")
  90     }
  91
  92     if (is.null(genoData)) {
  93         stop("There is no genotype dataset.")
  94         q("no", 1, FALSE)
  95     } else {
  96
  97         ## genoDataFilter::filterGenoData
  98         genoData <- convertToNumeric(genoData)
  99         genoData <- filterGenoData(genoData, maf = 0.01)
 100         genoData <- roundAlleleDosage(genoData)
 101
 102         message("No. of geno missing values, ", sum(is.na(genoData)))
 103         if (sum(is.na(genoData)) > 0) {
 104             genoDataMissing <- c("yes")
 105             genoData <- na.roughfix(genoData)
 106         }
 107
 108         genoData <- data.frame(genoData)
 109     }
 110 }
 111
 112 set.seed(235)
 113
 114 clusterDataNotScaled <- c()
 115
 116 if (grepl("genotype", dataType, ignore.case = TRUE)) {
 117     clusterData <- extractGenotype(inputFiles)
 118
 119     pca <- prcomp(clusterData, retx = TRUE)
 120     pca <- summary(pca)
 121
 122     variances <- data.frame(pca$importance)
 123
 124     varProp <- variances[3, ]
 125     varProp <- data.frame(t(varProp))
 126     names(varProp) <- c("cumVar")
 127
 128     selectPcs <- varProp %>%
 129         filter(cumVar <= 0.9)
 130     pcsCnt <- nrow(selectPcs)
 131
 132     reportNotes <- paste0("Before clustering this dataset, principal component analysis (PCA) was perforemd on it to reduce the number of variables (dimensions). ")
 133     reportNotes <- paste0(reportNotes, "Based on the PCA, ", pcsCnt, " PCs were used to do the clustering. ")
 134     reportNotes <- paste0(reportNotes, "\n\nThe ", pcsCnt, " PCs explain 90% of the variance in the original dataset.",
 135         "\n")
 136
 137     scores <- data.frame(pca$x)
 138     scores <- scores[, 1:pcsCnt]
 139     scores <- round(scores, 3)
 140
 141     variances <- variances[2, 1:pcsCnt]
 142     variances <- round(variances, 4) * 100
 143     variances <- data.frame(t(variances))
 144
 145     clusterData <- scores
 146 } else {
 147
 148     if (grepl("gebv", dataType, ignore.case = TRUE)) {
 149         gebvsFile <- grep("combined_gebvs", inputFiles, value = TRUE)
 150         gebvsData <- data.frame(fread(gebvsFile, header = TRUE))
 151
 152         clusterNa <- gebvsData %>%
 153             filter_all(any_vars(is.na(.)))
 154         clusterData <- column_to_rownames(gebvsData, "V1")
 155     } else if (grepl("phenotype", dataType, ignore.case = TRUE)) {
 156
 157         metaFile <- grep("meta", inputFiles, value = TRUE)
 158
 159         clusterData <- cleanAveragePhenotypes(inputFiles, metaDataFile = metaFile)
 160
 161         if (!is.na(predictedTraits) && length(predictedTraits) > 1) {
 162             clusterData <- rownames_to_column(clusterData, var = "germplasmName")
 163             clusterData <- clusterData %>%
 164                 select(c(germplasmName, predictedTraits))
 165             clusterData <- column_to_rownames(clusterData, var = "germplasmName")
 166         }
 167     }
 168
 169     clusterData <- clusterData[, apply(clusterData, 2, function(x) var(x, na.rm=TRUE)) != 0]
 170     clusterDataNotScaled <- na.omit(clusterData)
 171     clusterData <- scale(clusterDataNotScaled, center = TRUE, scale = TRUE)
 172     clusterData <- round(clusterData, 3)
 173     reportNotes <- paste0(reportNotes, "Note: Data was standardized before clustering.",
 174         "\n")
 175 }
 176 sIndexFile <- grep("selection_index", inputFiles, value = TRUE)
 177 selectedIndexGenotypes <- c()
 178
 179 if (length(sIndexFile) != 0) {
 180     sIndexData <- data.frame(fread(sIndexFile, header = TRUE))
 181     selectionProp <- selectionProp * 0.01
 182     selectedIndexGenotypes <- sIndexData %>%
 183         top_frac(selectionProp)
 184
 185     selectedIndexGenotypes <- column_to_rownames(selectedIndexGenotypes, var = "V1")
 186
 187     if (!is.null(selectedIndexGenotypes)) {
 188         clusterData <- rownames_to_column(clusterData, var = "germplasmName")
 189         clusterData <- clusterData %>%
 190             filter(germplasmName %in% rownames(selectedIndexGenotypes))
 191
 192         clusterData <- column_to_rownames(clusterData, var = "germplasmName")
 193     }
 194 }
 195
 196 kMeansOut <- kmeansruns(clusterData, runs = 10)
 197 kCenters <- kMeansOut$bestk
 198
 199 if (!is.na(userKNumbers)) {
 200     if (userKNumbers != 0) {
 201         kCenters <- as.integer(userKNumbers)
 202         reportNotes <- paste0(reportNotes, "\n\nThe data was partitioned into ", userKNumbers,
 203             " clusters.\n")
 204     }
 205 }
 206
 207 reportNotes <- paste0(reportNotes, "\n\nAccording the kmeansruns algorithm from the fpc R package, the recommended number of clusters (k) for this data set is: ",
 208     kCenters, "\n\nYou can also check the Elbow plot to evaluate how many clusters may be better suited for your purpose.")
 209
 210 kMeansOut <- kmeans(clusterData, centers = kCenters, nstart = 10)
 211 kClusters <- data.frame(kMeansOut$cluster)
 212 kClusters <- rownames_to_column(kClusters)
 213 names(kClusters) <- c("germplasmName", "Cluster")
 214
 215 if (!is.null(clusterDataNotScaled)) {
 216     clusterDataNotScaled <- rownames_to_column(clusterDataNotScaled, var = "germplasmName")
 217
 218     clusteredData <- inner_join(kClusters, clusterDataNotScaled, by = "germplasmName")
 219 } else if (!is.null(selectedIndexGenotypes)) {
 220     selectedIndexGenotypes <- rownames_to_column(selectedIndexGenotypes, var = "germplasmName")
 221     clusteredData <- inner_join(kClusters, selectedIndexGenotypes, by = "germplasmName")
 222 } else {
 223     clusteredData <- kClusters
 224 }
 225
 226 clusteredData <- clusteredData %>%
 227     mutate_if(is.double, round, 2) %>%
 228     arrange(Cluster)
 229
 230 # print(paste('size: ', '\n', '\n', round(kMeansOut$size, 2)))
 231 # print(paste('centers: ','\n', round(kMeansOut$centers, 2)))
 232
 233 if (length(elbowPlotFile) && !file.info(elbowPlotFile)$size) {
 234     message("running elbow method...")
 235     png(elbowPlotFile)
 236     print(fviz_nbclust(clusterData, k.max = 20, FUNcluster = kmeans, method = "wss"))
 237     dev.off()
 238 }
 239
 240 png(kmeansPlotFile)
 241 ggplot2::autoplot(kMeansOut, data = clusterData, frame = TRUE, x = 1, y = 2)
 242 # fviz_cluster(kMeansOut, geom = "point", main = "", data = clusterData)
 243 dev.off()
 244
 245 clusterMeans <- c()
 246 if (!grepl('genotype', kResultFile)) {
 247     message('adding cluster means to clusters...')
 248     clusterMeans <- aggregate(clusterDataNotScaled, by = list(cluster = kMeansOut$cluster),
 249     mean)
 250
 251     clusterMeans <- clusterMeans %>%
 252         select(-germplasmName) %>%
 253         mutate_if(is.double, round, 2)
 254
 255 }
 256
 257 cat(reportNotes, file = reportFile, sep = "\n", append = TRUE)
 258
 259 variances <- paste0("Variances output: ")
 260 variances <- append(variances, (paste0("\nBetween clusters sum of squares (betweenss): ",
 261     "\t", round(kMeansOut$betweenss, 2))))
 262 withinss <- round(kMeansOut$withinss, 2)
 263 withinss <- c(paste0("\nWithin clusters sum of squares (withinss): "), "\t", paste("\n\t",
 264     withinss))
 265 variances <- append(variances, withinss)
 266 variances <- append(variances, (paste0("\nTotal within cluster sum of squares (tot.withinss): ",
 267     "\t", round(kMeansOut$tot.withinss, 2))))
 268
 269 variances <- append(variances, (paste0("\nTotal sum of squares (totss): ", "\t",
 270     round(kMeansOut$totss, 2))))
 271
 272 cat(variances, file = variancesFile, sep = "", append = TRUE)
 273
 274 if (length(genoFiles) > 1) {
 275     fwrite(genoData, file = combinedDataFile, sep = "\t", row.names = TRUE, quote = FALSE,
 276         )
 277 }
 278
 279 if (length(kResultFile)) {
 280     fwrite(clusteredData, file = kResultFile, sep = "\t", row.names = FALSE, quote = FALSE,
 281         )
 282 }
 283
 284 if (length(clusterMeansFile) && !is.null(clusterMeans)) {
 285     fwrite(clusterMeans, file = clusterMeansFile, sep = "\t", row.names = FALSE,
 286         quote = FALSE, )
 287 }
 288
 289 message("Done clustering.")
 290
 291
 292 ####
 293 q(save = "no", runLast = FALSE)
 294 ####