3 # runs k-means cluster analysis
5 # AUTHOR Isaak Y Tecle (iyt2@cornell.edu)
12 library(genoDataFilter
)
20 library(phenoAnalysis
)
23 allArgs
<- commandArgs()
25 outputFiles
<- grep("output_files", allArgs
, value
= TRUE)
26 outputFiles
<- scan(outputFiles
, what
= "character")
28 inputFiles
<- grep("input_files", allArgs
, value
= TRUE)
29 inputFiles
<- scan(inputFiles
, what
= "character")
31 optionsFile
<- grep("options", inputFiles
, value
= TRUE)
32 clusterOptions
<- read
.table(optionsFile
, header
= TRUE, sep
= "\t", stringsAsFactors
= FALSE,
36 kmeansPlotFile
<- grep("k-means_plot", outputFiles
, value
= TRUE)
37 kResultFile
<- grep("result", outputFiles
, value
= TRUE)
38 elbowPlotFile
<- grep("elbow_plot", outputFiles
, value
= TRUE)
39 clusterMeansFile
<- grep("k-means_means", outputFiles
, value
= TRUE)
40 variancesFile
<- grep("k-means_variances", outputFiles
, value
= TRUE)
41 reportFile
<- grep("report", outputFiles
, value
= TRUE)
42 errorFile
<- grep("error", outputFiles
, value
= TRUE)
44 combinedDataFile
<- grep("combined_cluster_data_file", outputFiles
, value
= TRUE)
46 clusterOptions
<- column_to_rownames(clusterOptions
, var
= "Params")
47 userKNumbers
<- as
.numeric(clusterOptions
["k_numbers", 1])
48 dataType
<- clusterOptions
["data_type", 1]
49 selectionProp
<- as
.numeric(clusterOptions
["selection_proportion", 1])
50 predictedTraits
<- clusterOptions
["predicted_traits", 1]
51 predictedTraits
<- unlist(strsplit(predictedTraits
, ","))
53 if (is
.null(kResultFile
)) {
54 stop("Clustering output file is missing.")
62 genoDataMissing
<- c()
64 extractGenotype
<- function(inputFiles
) {
66 genoFiles
<- grep("genotype_data", inputFiles
, value
= TRUE)
69 filteredGenoFile
<- c()
71 if (length(genoFiles
) > 1) {
72 genoData
<- combineGenoData(genoFiles
)
74 genoMetaData
<- genoData$trial
75 genoData$trial
<- NULL
79 genoData
<- fread(genoFile
, header
= TRUE, na
.strings
= c("NA", " ", "--",
82 if (is
.null(genoData
)) {
83 filteredGenoFile
<- grep("filtered_genotype_data_", genoFile
, value
= TRUE)
84 genoData
<- fread(filteredGenoFile
, header
= TRUE)
87 genoData
<- unique(genoData
, by
= "V1")
88 genoData
<- data
.frame(genoData
)
89 genoData
<- column_to_rownames(genoData
, "V1")
92 if (is
.null(genoData
)) {
93 stop("There is no genotype dataset.")
97 ## genoDataFilter::filterGenoData
98 genoData
<- convertToNumeric(genoData
)
99 genoData
<- filterGenoData(genoData
, maf
= 0.01)
100 genoData
<- roundAlleleDosage(genoData
)
102 message("No. of geno missing values, ", sum(is
.na(genoData
)))
103 if (sum(is
.na(genoData
)) > 0) {
104 genoDataMissing
<- c("yes")
105 genoData
<- na
.roughfix(genoData
)
108 genoData
<- data
.frame(genoData
)
114 clusterDataNotScaled
<- c()
116 if (grepl("genotype", dataType
, ignore
.case
= TRUE)) {
117 clusterData
<- extractGenotype(inputFiles
)
119 pca
<- prcomp(clusterData
, retx
= TRUE)
122 variances
<- data
.frame(pca$importance
)
124 varProp
<- variances
[3, ]
125 varProp
<- data
.frame(t(varProp
))
126 names(varProp
) <- c("cumVar")
128 selectPcs
<- varProp
%>%
129 filter(cumVar
<= 0.9)
130 pcsCnt
<- nrow(selectPcs
)
132 reportNotes
<- paste0("Before clustering this dataset, principal component analysis (PCA) was perforemd on it to reduce the number of variables (dimensions). ")
133 reportNotes
<- paste0(reportNotes
, "Based on the PCA, ", pcsCnt
, " PCs were used to do the clustering. ")
134 reportNotes
<- paste0(reportNotes
, "\n\nThe ", pcsCnt
, " PCs explain 90% of the variance in the original dataset.",
137 scores
<- data
.frame(pca$x
)
138 scores
<- scores
[, 1:pcsCnt
]
139 scores
<- round(scores
, 3)
141 variances
<- variances
[2, 1:pcsCnt
]
142 variances
<- round(variances
, 4) * 100
143 variances
<- data
.frame(t(variances
))
145 clusterData
<- scores
148 if (grepl("gebv", dataType
, ignore
.case
= TRUE)) {
149 gebvsFile
<- grep("combined_gebvs", inputFiles
, value
= TRUE)
150 gebvsData
<- data
.frame(fread(gebvsFile
, header
= TRUE))
152 clusterNa
<- gebvsData
%>%
153 filter_all(any_vars(is
.na(.)))
154 clusterData
<- column_to_rownames(gebvsData
, "V1")
155 } else if (grepl("phenotype", dataType
, ignore
.case
= TRUE)) {
157 metaFile
<- grep("meta", inputFiles
, value
= TRUE)
159 clusterData
<- cleanAveragePhenotypes(inputFiles
, metaDataFile
= metaFile
)
161 if (!is
.na(predictedTraits
) && length(predictedTraits
) > 1) {
162 clusterData
<- rownames_to_column(clusterData
, var
= "germplasmName")
163 clusterData
<- clusterData
%>%
164 select(c(germplasmName
, predictedTraits
))
165 clusterData
<- column_to_rownames(clusterData
, var
= "germplasmName")
169 clusterData
<- clusterData
[, apply(clusterData
, 2, function(x
) var(x
, na
.rm
=TRUE)) != 0]
170 clusterDataNotScaled
<- na
.omit(clusterData
)
171 clusterData
<- scale(clusterDataNotScaled
, center
= TRUE, scale
= TRUE)
172 clusterData
<- round(clusterData
, 3)
173 reportNotes
<- paste0(reportNotes
, "Note: Data was standardized before clustering.",
176 sIndexFile
<- grep("selection_index", inputFiles
, value
= TRUE)
177 selectedIndexGenotypes
<- c()
179 if (length(sIndexFile
) != 0) {
180 sIndexData
<- data
.frame(fread(sIndexFile
, header
= TRUE))
181 selectionProp
<- selectionProp
* 0.01
182 selectedIndexGenotypes
<- sIndexData
%>%
183 top_frac(selectionProp
)
185 selectedIndexGenotypes
<- column_to_rownames(selectedIndexGenotypes
, var
= "V1")
187 if (!is
.null(selectedIndexGenotypes
)) {
188 clusterData
<- rownames_to_column(clusterData
, var
= "germplasmName")
189 clusterData
<- clusterData
%>%
190 filter(germplasmName
%in% rownames(selectedIndexGenotypes
))
192 clusterData
<- column_to_rownames(clusterData
, var
= "germplasmName")
196 kMeansOut
<- kmeansruns(clusterData
, runs
= 10)
197 kCenters
<- kMeansOut$bestk
199 if (!is
.na(userKNumbers
)) {
200 if (userKNumbers
!= 0) {
201 kCenters
<- as
.integer(userKNumbers
)
202 reportNotes
<- paste0(reportNotes
, "\n\nThe data was partitioned into ", userKNumbers
,
207 reportNotes
<- paste0(reportNotes
, "\n\nAccording the kmeansruns algorithm from the fpc R package, the recommended number of clusters (k) for this data set is: ",
208 kCenters
, "\n\nYou can also check the Elbow plot to evaluate how many clusters may be better suited for your purpose.")
210 kMeansOut
<- kmeans(clusterData
, centers
= kCenters
, nstart
= 10)
211 kClusters
<- data
.frame(kMeansOut$cluster
)
212 kClusters
<- rownames_to_column(kClusters
)
213 names(kClusters
) <- c("germplasmName", "Cluster")
215 if (!is
.null(clusterDataNotScaled
)) {
216 clusterDataNotScaled
<- rownames_to_column(clusterDataNotScaled
, var
= "germplasmName")
218 clusteredData
<- inner_join(kClusters
, clusterDataNotScaled
, by
= "germplasmName")
219 } else if (!is
.null(selectedIndexGenotypes
)) {
220 selectedIndexGenotypes
<- rownames_to_column(selectedIndexGenotypes
, var
= "germplasmName")
221 clusteredData
<- inner_join(kClusters
, selectedIndexGenotypes
, by
= "germplasmName")
223 clusteredData
<- kClusters
226 clusteredData
<- clusteredData
%>%
227 mutate_if(is
.double
, round
, 2) %>%
230 # print(paste('size: ', '\n', '\n', round(kMeansOut$size, 2)))
231 # print(paste('centers: ','\n', round(kMeansOut$centers, 2)))
233 if (length(elbowPlotFile
) && !file
.info(elbowPlotFile
)$size
) {
234 message("running elbow method...")
236 print(fviz_nbclust(clusterData
, k
.max
= 20, FUNcluster
= kmeans
, method
= "wss"))
241 ggplot2
::autoplot(kMeansOut
, data
= clusterData
, frame
= TRUE, x
= 1, y
= 2)
242 # fviz_cluster(kMeansOut, geom = "point", main = "", data = clusterData)
246 if (!grepl('genotype', kResultFile
)) {
247 message('adding cluster means to clusters...')
248 clusterMeans
<- aggregate(clusterDataNotScaled
, by
= list(cluster
= kMeansOut$cluster
),
251 clusterMeans
<- clusterMeans
%>%
252 select(-germplasmName
) %>%
253 mutate_if(is
.double
, round
, 2)
257 cat(reportNotes
, file
= reportFile
, sep
= "\n", append
= TRUE)
259 variances
<- paste0("Variances output: ")
260 variances
<- append(variances
, (paste0("\nBetween clusters sum of squares (betweenss): ",
261 "\t", round(kMeansOut$betweenss
, 2))))
262 withinss
<- round(kMeansOut$withinss
, 2)
263 withinss
<- c(paste0("\nWithin clusters sum of squares (withinss): "), "\t", paste("\n\t",
265 variances
<- append(variances
, withinss
)
266 variances
<- append(variances
, (paste0("\nTotal within cluster sum of squares (tot.withinss): ",
267 "\t", round(kMeansOut$tot
.withinss
, 2))))
269 variances
<- append(variances
, (paste0("\nTotal sum of squares (totss): ", "\t",
270 round(kMeansOut$totss
, 2))))
272 cat(variances
, file
= variancesFile
, sep
= "", append
= TRUE)
274 if (length(genoFiles
) > 1) {
275 fwrite(genoData
, file
= combinedDataFile
, sep
= "\t", row
.names
= TRUE, quote
= FALSE,
279 if (length(kResultFile
)) {
280 fwrite(clusteredData
, file
= kResultFile
, sep
= "\t", row
.names
= FALSE, quote
= FALSE,
284 if (length(clusterMeansFile
) && !is
.null(clusterMeans
)) {
285 fwrite(clusterMeans
, file
= clusterMeansFile
, sep
= "\t", row
.names
= FALSE,
289 message("Done clustering.")
293 q(save
= "no", runLast
= FALSE)