3 #runs k-means cluster analysis
6 # Isaak Y Tecle (iyt2@cornell.edu)
13 library(genoDataFilter
)
19 library(phenoAnalysis
)
24 library(R2D3
) #install_github("jamesthomson/R2D3")
26 allArgs
<- commandArgs()
28 outputFiles
<- grep("output_files", allArgs
, value
= TRUE)
29 outputFiles
<- scan(outputFiles
, what
= "character")
31 inputFiles
<- grep("input_files", allArgs
, value
= TRUE)
32 inputFiles
<- scan(inputFiles
, what
= "character")
34 clusterFile
<- grep("cluster", outputFiles
, value
= TRUE)
35 newickFile
<- grep("newick", outputFiles
, value
= TRUE)
36 jsonFile
<- grep("json", outputFiles
, value
= TRUE)
37 reportFile
<- grep("report", outputFiles
, value
= TRUE)
38 errorFile
<- grep("error", outputFiles
, value
= TRUE)
40 combinedDataFile
<- grep("combined_cluster_data_file", outputFiles
, value
= TRUE)
42 plotFile
<- grep("plot", outputFiles
, value
= TRUE)
43 optionsFile
<- grep("options", inputFiles
, value
= TRUE)
45 clusterOptions
<- read
.table(optionsFile
,
48 stringsAsFactors
= FALSE,
51 clusterOptions
<- column_to_rownames(clusterOptions
, var
= "Params")
52 # userKNumbers <- as.numeric(clusterOptions["k_numbers", 1])
53 dataType
<- clusterOptions
["data_type", 1]
54 selectionProp
<- as
.numeric(clusterOptions
["selection_proportion", 1])
55 predictedTraits
<- clusterOptions
["predicted_traits", 1]
56 predictedTraits
<- unlist(strsplit(predictedTraits
, ','))
58 if (is
.null(newickFile
) && is
.null(jsonFile
)) {
59 stop("Hierarchical output file is missing.")
67 genoDataMissing
<- c()
69 extractGenotype
<- function(inputFiles
) {
71 genoFiles
<- grep("genotype_data", inputFiles
, value
= TRUE)
74 filteredGenoFile
<- c()
76 if (length(genoFiles
) > 1) {
77 genoData
<- combineGenoData(genoFiles
)
79 genoMetaData
<- genoData$trial
80 genoData$trial
<- NULL
84 genoData
<- fread(genoFile
,
86 na
.strings
= c("NA", " ", "--", "-", "."))
88 if (is
.null(genoData
)) {
89 filteredGenoFile
<- grep("filtered_genotype_data_", genoFile
, value
= TRUE)
90 genoData
<- fread(filteredGenoFile
, header
= TRUE)
93 genoData
<- unique(genoData
, by
= "V1")
94 genoData
<- data
.frame(genoData
)
95 genoData
<- column_to_rownames(genoData
, "V1")
98 if (is
.null(genoData
)) {
99 stop("There is no genotype dataset.")
103 ##genoDataFilter::filterGenoData
104 genoData
<- convertToNumeric(genoData
)
105 genoData
<- filterGenoData(genoData
, maf
=0.01)
106 genoData
<- roundAlleleDosage(genoData
)
108 message("No. of geno missing values, ", sum(is
.na(genoData
)))
109 if (sum(is
.na(genoData
)) > 0) {
110 genoDataMissing
<- c("yes")
111 genoData
<- na
.roughfix(genoData
)
114 genoData
<- data
.frame(genoData
)
120 clusterDataNotScaled
<- c()
122 if (grepl("genotype", dataType
, ignore
.case
= TRUE)) {
123 clusterData
<- extractGenotype(inputFiles
)
125 pca
<- prcomp(clusterData
, retx
= TRUE)
128 variances
<- data
.frame(pca$importance
)
130 varProp
<- variances
[3, ]
131 varProp
<- data
.frame(t(varProp
))
132 names(varProp
) <- c("cumVar")
134 selectPcs
<- varProp
%>% filter(cumVar
<= 0.9)
135 pcsCnt
<- nrow(selectPcs
)
137 reportNotes
<- paste0("Before clustering this dataset, principal component analysis (PCA) was done on it. ", "\n")
138 reportNotes
<- paste0(reportNotes
, "Based on the PCA, ", pcsCnt
, " PCs are used to cluster this dataset. ", "\n")
139 reportNotes
<- paste0(reportNotes
, "They explain 90% of the variance in the original dataset.", "\n")
141 scores
<- data
.frame(pca$x
)
142 scores
<- scores
[, 1:pcsCnt
]
143 scores
<- round(scores
, 3)
145 variances
<- variances
[2, 1:pcsCnt
]
146 variances
<- round(variances
, 4) * 100
147 variances
<- data
.frame(t(variances
))
149 clusterData
<- scores
152 if (grepl("gebv", dataType
, ignore
.case
= TRUE)) {
153 gebvsFile
<- grep("combined_gebvs", inputFiles
, value
= TRUE)
154 gebvsData
<- data
.frame(fread(gebvsFile
, header
= TRUE))
156 clusterNa
<- gebvsData
%>% filter_all(any_vars(is
.na(.)))
157 clusterData
<- column_to_rownames(gebvsData
, 'V1')
158 } else if (grepl("phenotype", dataType
, ignore
.case
= TRUE)) {
160 metaFile
<- grep("meta", inputFiles
, value
= TRUE)
162 clusterData
<- cleanAveragePhenotypes(inputFiles
, metaDataFile
= metaFile
)
164 if (!is
.na(predictedTraits
) && length(predictedTraits
) > 1) {
165 clusterData
<- rownames_to_column(clusterData
, var
= "germplasmName")
166 clusterData
<- clusterData
%>% select(c(germplasmName
, predictedTraits
))
167 clusterData
<- column_to_rownames(clusterData
, var
= "germplasmName")
171 clusterDataNotScaled
<- na
.omit(clusterData
)
173 clusterData
<- scale(clusterDataNotScaled
, center
=TRUE, scale
=TRUE)
174 reportNotes
<- paste0(reportNotes
, "Note: Data was standardized before clustering.", "\n")
177 sIndexFile
<- grep("selection_index", inputFiles
, value
= TRUE)
178 selectedIndexGenotypes
<- c()
180 if (length(sIndexFile
) != 0) {
181 sIndexData
<- data
.frame(fread(sIndexFile
, header
= TRUE))
182 selectionProp
<- selectionProp
* 0.01
183 selectedIndexGenotypes
<- sIndexData
%>% top_frac(selectionProp
)
185 selectedIndexGenotypes
<- column_to_rownames(selectedIndexGenotypes
, var
= "V1")
187 if (!is
.null(selectedIndexGenotypes
)) {
188 clusterData
<- rownames_to_column(clusterData
, var
= "germplasmName")
189 clusterData
<- clusterData
%>%
190 filter(germplasmName
%in% rownames(selectedIndexGenotypes
))
192 clusterData
<- column_to_rownames(clusterData
, var
= "germplasmName")
196 distMat
<- clusterData
%>%
197 dist(., method
="euclidean")
199 distMat
<- round(distMat
, 3)
201 hClust
<- distMat
%>%
202 hclust(., method
="complete")
205 distTable
<- data
.frame(as
.matrix(distMat
))
207 clustTree
<- ggtree(hClust
, layout
= "circular", color
= "#96CA2D")
208 xMax
<- ggplot_build(clustTree
)$layout$panel_scales_x
[[1]]$range$range
[2]
211 # ggplot2::xlim(0, xMax)
212 clustTree
<- clustTree
+
213 geom_tiplab(size
= 4, color
= "blue")
216 # geom_text(aes(x = branch, label = round(branch.length, 2)))
218 png(plotFile
, height
= 950, width
= 950)
223 cat(reportNotes
, file
= reportFile
, sep
= "\n", append
= TRUE)
225 if (length(genoFiles
) > 1) {
227 file
= combinedDataFile
,
233 # if (length(resultFile) != 0 ) {
234 # fwrite(clustTreeData,
241 message('newick file ', newickFile
)
242 newickF
<- as
.phylo(hClust
)
243 write
.tree(phy
= newickF
,
247 jsonHclust
<- jsonHC(hClust
)
248 write(jsonHclust$json
, file
= jsonFile
)
251 q(save
= "no", runLast
= FALSE)