R/solGS/hclustering.r

   1  #SNOPSIS
   2
   3  #runs k-means cluster analysis
   4
   5  #AUTHOR
   6  # Isaak Y Tecle (iyt2@cornell.edu)
   7
   8
   9 options(echo = FALSE)
  10
  11 library(randomForest)
  12 library(data.table)
  13 library(genoDataFilter)
  14 library(tibble)
  15 library(dplyr)
  16 library(ggfortify)
  17 library(tibble)
  18 library(stringi)
  19 library(phenoAnalysis)
  20 library(ggplot2)
  21 library(ape)
  22 library(ggtree)
  23 library(treeio)
  24 library(R2D3) #install_github("jamesthomson/R2D3")
  25
  26 allArgs <- commandArgs()
  27
  28 outputFiles  <- grep("output_files", allArgs, value = TRUE)
  29 outputFiles <- scan(outputFiles, what = "character")
  30
  31 inputFiles  <- grep("input_files", allArgs, value = TRUE)
  32 inputFiles <- scan(inputFiles, what = "character")
  33
  34 clusterFile <- grep("cluster", outputFiles, value = TRUE)
  35 newickFile <- grep("newick", outputFiles, value = TRUE)
  36 jsonFile <- grep("json", outputFiles, value = TRUE)
  37 reportFile  <- grep("report", outputFiles, value = TRUE)
  38 errorFile   <- grep("error", outputFiles, value = TRUE)
  39
  40 combinedDataFile <- grep("combined_cluster_data_file", outputFiles, value = TRUE)
  41
  42 plotFile <- grep("plot", outputFiles, value = TRUE)
  43 optionsFile    <- grep("options", inputFiles,  value = TRUE)
  44
  45 clusterOptions <- read.table(optionsFile,
  46                              header = TRUE,
  47                              sep = "\t",
  48                              stringsAsFactors = FALSE,
  49                              na.strings = "")
  50 print(clusterOptions)
  51 clusterOptions <- column_to_rownames(clusterOptions, var = "Params")
  52 # userKNumbers   <- as.numeric(clusterOptions["k_numbers", 1])
  53 dataType       <- clusterOptions["data_type", 1]
  54 selectionProp  <- as.numeric(clusterOptions["selection_proportion", 1])
  55 predictedTraits <- clusterOptions["predicted_traits", 1]
  56 predictedTraits <- unlist(strsplit(predictedTraits, ','))
  57
  58 if (is.null(newickFile) && is.null(jsonFile)) {
  59   stop("Hierarchical output file is missing.")
  60   q("no", 1, FALSE)
  61 }
  62
  63 clusterData <- c()
  64 genoData    <- c()
  65 genoFiles   <- c()
  66 reportNotes <- c()
  67 genoDataMissing <- c()
  68
  69 extractGenotype <- function(inputFiles) {
  70
  71     genoFiles <- grep("genotype_data", inputFiles,  value = TRUE)
  72
  73     genoMetaData <- c()
  74     filteredGenoFile <- c()
  75
  76     if (length(genoFiles) > 1) {
  77         genoData <- combineGenoData(genoFiles)
  78
  79         genoMetaData   <- genoData$trial
  80         genoData$trial <- NULL
  81
  82     } else {
  83         genoFile <- genoFiles
  84         genoData <- fread(genoFile,
  85                           header = TRUE,
  86                           na.strings = c("NA", " ", "--", "-", "."))
  87
  88         if (is.null(genoData)) {
  89             filteredGenoFile <- grep("filtered_genotype_data_",  genoFile, value = TRUE)
  90             genoData <- fread(filteredGenoFile, header = TRUE)
  91         }
  92
  93         genoData <- unique(genoData, by = "V1")
  94         genoData <- data.frame(genoData)
  95         genoData <- column_to_rownames(genoData, "V1")
  96     }
  97
  98     if (is.null(genoData)) {
  99         stop("There is no genotype dataset.")
 100         q("no", 1, FALSE)
 101     } else {
 102
 103         ##genoDataFilter::filterGenoData
 104         genoData <- convertToNumeric(genoData)
 105         genoData <- filterGenoData(genoData, maf=0.01)
 106         genoData <- roundAlleleDosage(genoData)
 107
 108         message("No. of geno missing values, ", sum(is.na(genoData)))
 109         if (sum(is.na(genoData)) > 0) {
 110             genoDataMissing <- c("yes")
 111             genoData <- na.roughfix(genoData)
 112         }
 113
 114         genoData <- data.frame(genoData)
 115     }
 116 }
 117
 118 set.seed(235)
 119
 120 clusterDataNotScaled <- c()
 121
 122 if (grepl("genotype", dataType, ignore.case = TRUE)) {
 123     clusterData <- extractGenotype(inputFiles)
 124
 125     pca    <- prcomp(clusterData, retx = TRUE)
 126     pca    <- summary(pca)
 127
 128     variances <- data.frame(pca$importance)
 129
 130     varProp        <- variances[3, ]
 131     varProp        <- data.frame(t(varProp))
 132     names(varProp) <- c("cumVar")
 133
 134     selectPcs <- varProp %>% filter(cumVar <= 0.9)
 135     pcsCnt    <- nrow(selectPcs)
 136
 137     reportNotes <- paste0("Before clustering this dataset, principal component analysis (PCA) was done on it. ", "\n")
 138     reportNotes <- paste0(reportNotes, "Based on the PCA, ", pcsCnt, " PCs are used to cluster this dataset. ", "\n")
 139     reportNotes <- paste0(reportNotes, "They explain 90% of the variance in the original dataset.", "\n")
 140
 141     scores   <- data.frame(pca$x)
 142     scores   <- scores[, 1:pcsCnt]
 143     scores   <- round(scores, 3)
 144
 145     variances <- variances[2, 1:pcsCnt]
 146     variances <- round(variances, 4) * 100
 147     variances <- data.frame(t(variances))
 148
 149     clusterData <- scores
 150 } else {
 151
 152     if (grepl("gebv", dataType, ignore.case = TRUE)) {
 153         gebvsFile <- grep("combined_gebvs", inputFiles,  value = TRUE)
 154         gebvsData <- data.frame(fread(gebvsFile, header = TRUE))
 155
 156         clusterNa   <- gebvsData %>% filter_all(any_vars(is.na(.)))
 157         clusterData <- column_to_rownames(gebvsData, 'V1')
 158     } else if (grepl("phenotype", dataType, ignore.case = TRUE)) {
 159
 160         metaFile <- grep("meta", inputFiles,  value = TRUE)
 161
 162         clusterData <- cleanAveragePhenotypes(inputFiles, metaDataFile = metaFile)
 163
 164         if (!is.na(predictedTraits) && length(predictedTraits) > 1) {
 165             clusterData <- rownames_to_column(clusterData, var = "germplasmName")
 166             clusterData <- clusterData %>% select(c(germplasmName, predictedTraits))
 167             clusterData <- column_to_rownames(clusterData, var = "germplasmName")
 168         }
 169     }
 170
 171     clusterDataNotScaled <- na.omit(clusterData)
 172
 173     clusterData <- scale(clusterDataNotScaled, center=TRUE, scale=TRUE)
 174     reportNotes <- paste0(reportNotes, "Note: Data was standardized before clustering.", "\n")
 175 }
 176
 177 sIndexFile <- grep("selection_index", inputFiles, value = TRUE)
 178 selectedIndexGenotypes <- c()
 179
 180 if (length(sIndexFile) != 0) {
 181     sIndexData <- data.frame(fread(sIndexFile, header = TRUE))
 182     selectionProp <- selectionProp * 0.01
 183     selectedIndexGenotypes <- sIndexData %>% top_frac(selectionProp)
 184
 185     selectedIndexGenotypes <- column_to_rownames(selectedIndexGenotypes, var = "V1")
 186
 187     if (!is.null(selectedIndexGenotypes)) {
 188         clusterData <- rownames_to_column(clusterData, var = "germplasmName")
 189         clusterData <- clusterData %>%
 190             filter(germplasmName %in% rownames(selectedIndexGenotypes))
 191
 192         clusterData <- column_to_rownames(clusterData, var = "germplasmName")
 193     }
 194 }
 195
 196 distMat <- clusterData %>%
 197                 dist(., method="euclidean")
 198
 199 distMat <- round(distMat, 3)
 200
 201 hClust <- distMat  %>%
 202                 hclust(., method="complete")
 203
 204
 205 distTable <- data.frame(as.matrix(distMat))
 206
 207 clustTree <- ggtree(hClust,  layout = "circular", color = "#96CA2D")
 208 xMax <- ggplot_build(clustTree)$layout$panel_scales_x[[1]]$range$range[2]
 209 xMax <- xMax + 0.02
 210
 211  # ggplot2::xlim(0, xMax)
 212 clustTree <- clustTree +
 213     geom_tiplab(size = 4, color = "blue")
 214
 215
 216 # geom_text(aes(x = branch, label = round(branch.length, 2)))
 217
 218 png(plotFile, height = 950, width = 950)
 219    print(clustTree)
 220 dev.off()
 221
 222
 223 cat(reportNotes, file = reportFile, sep = "\n", append = TRUE)
 224
 225 if (length(genoFiles) > 1) {
 226     fwrite(genoData,
 227        file      = combinedDataFile,
 228        sep       = "\t",
 229        quote     = FALSE,
 230        )
 231 }
 232
 233 # if (length(resultFile) != 0 ) {
 234 #     fwrite(clustTreeData,
 235 #        file      = resultFile,
 236 #        sep       = "\t",
 237 #        row.names = TRUE,
 238 #        quote     = FALSE,
 239 #        )
 240 # }
 241 message('newick file ', newickFile)
 242 newickF <- as.phylo(hClust)
 243 write.tree(phy = newickF,
 244 file = newickFile
 245 )
 246
 247 jsonHclust <- jsonHC(hClust)
 248 write(jsonHclust$json, file = jsonFile)
 249
 250 ####
 251 q(save = "no", runLast = FALSE)
 252 ####