Merge pull request #4719 from solgenomics/topic/fixing_histogram
[sgn.git] / R / solGS / hclustering.r
blob57073667e29cc1bd5dd6313f15613b2f96821e5c
1 #SNOPSIS
3 #runs k-means cluster analysis
5 #AUTHOR
6 # Isaak Y Tecle (iyt2@cornell.edu)
9 options(echo = FALSE)
11 library(randomForest)
12 library(data.table)
13 library(genoDataFilter)
14 library(tibble)
15 library(dplyr)
16 library(ggfortify)
17 library(tibble)
18 library(stringi)
19 library(phenoAnalysis)
20 library(ggplot2)
21 library(ape)
22 library(ggtree)
23 library(treeio)
24 library(R2D3) #install_github("jamesthomson/R2D3")
26 allArgs <- commandArgs()
28 outputFiles <- grep("output_files", allArgs, value = TRUE)
29 outputFiles <- scan(outputFiles, what = "character")
31 inputFiles <- grep("input_files", allArgs, value = TRUE)
32 inputFiles <- scan(inputFiles, what = "character")
34 clusterFile <- grep("cluster", outputFiles, value = TRUE)
35 newickFile <- grep("newick", outputFiles, value = TRUE)
36 jsonFile <- grep("json", outputFiles, value = TRUE)
37 reportFile <- grep("report", outputFiles, value = TRUE)
38 errorFile <- grep("error", outputFiles, value = TRUE)
40 combinedDataFile <- grep("combined_cluster_data_file", outputFiles, value = TRUE)
42 plotFile <- grep("plot", outputFiles, value = TRUE)
43 optionsFile <- grep("options", inputFiles, value = TRUE)
45 clusterOptions <- read.table(optionsFile,
46 header = TRUE,
47 sep = "\t",
48 stringsAsFactors = FALSE,
49 na.strings = "")
50 print(clusterOptions)
51 clusterOptions <- column_to_rownames(clusterOptions, var = "Params")
52 # userKNumbers <- as.numeric(clusterOptions["k_numbers", 1])
53 dataType <- clusterOptions["data_type", 1]
54 selectionProp <- as.numeric(clusterOptions["selection_proportion", 1])
55 predictedTraits <- clusterOptions["predicted_traits", 1]
56 predictedTraits <- unlist(strsplit(predictedTraits, ','))
58 if (is.null(newickFile) && is.null(jsonFile)) {
59 stop("Hierarchical output file is missing.")
60 q("no", 1, FALSE)
63 clusterData <- c()
64 genoData <- c()
65 genoFiles <- c()
66 reportNotes <- c()
67 genoDataMissing <- c()
69 extractGenotype <- function(inputFiles) {
71 genoFiles <- grep("genotype_data", inputFiles, value = TRUE)
73 genoMetaData <- c()
74 filteredGenoFile <- c()
76 if (length(genoFiles) > 1) {
77 genoData <- combineGenoData(genoFiles)
79 genoMetaData <- genoData$trial
80 genoData$trial <- NULL
82 } else {
83 genoFile <- genoFiles
84 genoData <- fread(genoFile,
85 header = TRUE,
86 na.strings = c("NA", " ", "--", "-", "."))
88 if (is.null(genoData)) {
89 filteredGenoFile <- grep("filtered_genotype_data_", genoFile, value = TRUE)
90 genoData <- fread(filteredGenoFile, header = TRUE)
93 genoData <- unique(genoData, by = "V1")
94 genoData <- data.frame(genoData)
95 genoData <- column_to_rownames(genoData, "V1")
98 if (is.null(genoData)) {
99 stop("There is no genotype dataset.")
100 q("no", 1, FALSE)
101 } else {
103 ##genoDataFilter::filterGenoData
104 genoData <- convertToNumeric(genoData)
105 genoData <- filterGenoData(genoData, maf=0.01)
106 genoData <- roundAlleleDosage(genoData)
108 message("No. of geno missing values, ", sum(is.na(genoData)))
109 if (sum(is.na(genoData)) > 0) {
110 genoDataMissing <- c("yes")
111 genoData <- na.roughfix(genoData)
114 genoData <- data.frame(genoData)
118 set.seed(235)
120 clusterDataNotScaled <- c()
122 if (grepl("genotype", dataType, ignore.case = TRUE)) {
123 clusterData <- extractGenotype(inputFiles)
125 pca <- prcomp(clusterData, retx = TRUE)
126 pca <- summary(pca)
128 variances <- data.frame(pca$importance)
130 varProp <- variances[3, ]
131 varProp <- data.frame(t(varProp))
132 names(varProp) <- c("cumVar")
134 selectPcs <- varProp %>% filter(cumVar <= 0.9)
135 pcsCnt <- nrow(selectPcs)
137 reportNotes <- paste0("Before clustering this dataset, principal component analysis (PCA) was done on it. ", "\n")
138 reportNotes <- paste0(reportNotes, "Based on the PCA, ", pcsCnt, " PCs are used to cluster this dataset. ", "\n")
139 reportNotes <- paste0(reportNotes, "They explain 90% of the variance in the original dataset.", "\n")
141 scores <- data.frame(pca$x)
142 scores <- scores[, 1:pcsCnt]
143 scores <- round(scores, 3)
145 variances <- variances[2, 1:pcsCnt]
146 variances <- round(variances, 4) * 100
147 variances <- data.frame(t(variances))
149 clusterData <- scores
150 } else {
152 if (grepl("gebv", dataType, ignore.case = TRUE)) {
153 gebvsFile <- grep("combined_gebvs", inputFiles, value = TRUE)
154 gebvsData <- data.frame(fread(gebvsFile, header = TRUE))
156 clusterNa <- gebvsData %>% filter_all(any_vars(is.na(.)))
157 clusterData <- column_to_rownames(gebvsData, 'V1')
158 } else if (grepl("phenotype", dataType, ignore.case = TRUE)) {
160 metaFile <- grep("meta", inputFiles, value = TRUE)
162 clusterData <- cleanAveragePhenotypes(inputFiles, metaDataFile = metaFile)
164 if (!is.na(predictedTraits) && length(predictedTraits) > 1) {
165 clusterData <- rownames_to_column(clusterData, var = "germplasmName")
166 clusterData <- clusterData %>% select(c(germplasmName, predictedTraits))
167 clusterData <- column_to_rownames(clusterData, var = "germplasmName")
171 clusterDataNotScaled <- na.omit(clusterData)
173 clusterData <- scale(clusterDataNotScaled, center=TRUE, scale=TRUE)
174 reportNotes <- paste0(reportNotes, "Note: Data was standardized before clustering.", "\n")
177 sIndexFile <- grep("selection_index", inputFiles, value = TRUE)
178 selectedIndexGenotypes <- c()
180 if (length(sIndexFile) != 0) {
181 sIndexData <- data.frame(fread(sIndexFile, header = TRUE))
182 selectionProp <- selectionProp * 0.01
183 selectedIndexGenotypes <- sIndexData %>% top_frac(selectionProp)
185 selectedIndexGenotypes <- column_to_rownames(selectedIndexGenotypes, var = "V1")
187 if (!is.null(selectedIndexGenotypes)) {
188 clusterData <- rownames_to_column(clusterData, var = "germplasmName")
189 clusterData <- clusterData %>%
190 filter(germplasmName %in% rownames(selectedIndexGenotypes))
192 clusterData <- column_to_rownames(clusterData, var = "germplasmName")
196 distMat <- clusterData %>%
197 dist(., method="euclidean")
199 distMat <- round(distMat, 3)
201 hClust <- distMat %>%
202 hclust(., method="complete")
205 distTable <- data.frame(as.matrix(distMat))
207 clustTree <- ggtree(hClust, layout = "circular", color = "#96CA2D")
208 xMax <- ggplot_build(clustTree)$layout$panel_scales_x[[1]]$range$range[2]
209 xMax <- xMax + 0.02
211 # ggplot2::xlim(0, xMax)
212 clustTree <- clustTree +
213 geom_tiplab(size = 4, color = "blue")
216 # geom_text(aes(x = branch, label = round(branch.length, 2)))
218 png(plotFile, height = 950, width = 950)
219 print(clustTree)
220 dev.off()
223 cat(reportNotes, file = reportFile, sep = "\n", append = TRUE)
225 if (length(genoFiles) > 1) {
226 fwrite(genoData,
227 file = combinedDataFile,
228 sep = "\t",
229 quote = FALSE,
233 # if (length(resultFile) != 0 ) {
234 # fwrite(clustTreeData,
235 # file = resultFile,
236 # sep = "\t",
237 # row.names = TRUE,
238 # quote = FALSE,
241 message('newick file ', newickFile)
242 newickF <- as.phylo(hClust)
243 write.tree(phy = newickF,
244 file = newickFile
247 jsonHclust <- jsonHC(hClust)
248 write(jsonHclust$json, file = jsonFile)
250 ####
251 q(save = "no", runLast = FALSE)
252 ####