3 #runs correlation analysis.
4 #Correlation coeffiecients are stored in tabular and json formats
7 # Isaak Y Tecle (iyt2@cornell.edu)
16 library(phenoAnalysis
)
22 allArgs
<- commandArgs()
24 outputFiles
<- scan(grep("output_files", allArgs
, value
= TRUE),
26 inputFiles
<- scan(grep("input_files", allArgs
, value
= TRUE),
29 message('inputFiles: ', inputFiles
)
32 correCoefsTableFile
<- c()
33 correCoefsJsonFile
<- c()
35 if (any(grepl("phenotype_data", inputFiles
))) {
36 refererQtl
<- grep("qtl", inputFiles
, value
=TRUE)
38 phenoDataFile
<- grep("\\/phenotype_data", inputFiles
, value
=TRUE)
39 formattedPhenoFile
<- grep("formatted_phenotype_data", inputFiles
, fixed
= FALSE, value
= TRUE)
40 metaFile
<- grep("metadata", inputFiles
, value
=TRUE)
41 correCoefsTableFile
<- grep("pheno_corr_table", outputFiles
, value
=TRUE)
42 correCoefsJsonFile
<- grep("pheno_corr_json", outputFiles
, value
=TRUE)
44 formattedPhenoData
<- c()
47 if ( length(refererQtl
) != 0 ) {
48 phenoDataFile
<- grep("\\/phenodata", inputFiles
, value
=TRUE)
49 phenoData
<- data
.frame(fread(phenoDataFile
,
52 na
.strings
=c("NA", "-", " ", ".", "..")))
55 metaData
<- scan(metaFile
, what
="character")
61 if (length(refererQtl
) != 0) {
62 allNames
<- names(phenoData
)
63 nonTraitNames
<- c("ID")
64 allTraitNames
<- allNames
[! allNames
%in% nonTraitNames
]
69 if (length(refererQtl
) == 0 ) {
70 averagedPhenoData
<- cleanAveragePhenotypes(inputFiles
, metaDataFile
= metaFile
)
71 allNames
<- names(averagedPhenoData
)
72 nonTraitNames
<- metaData
73 allTraitNames
<- allNames
[! allNames
%in% nonTraitNames
]
75 rownames(averagedPhenoData
) <- NULL
76 correPhenoData
<- averagedPhenoData
79 correPhenoData
<- phenoData
%>%
81 summarise_if(is
.numeric
, mean
, na
.rm
=TRUE) %>%
88 correInputData
<- correPhenoData
89 print(head(correPhenoData
))
91 } else if (any(grepl("combined_gebvs", inputFiles
)) ||
92 any(grepl("selection_index", inputFiles
))) {
94 correCoefsTableFile
<- grep("genetic_corr_table", outputFiles
, value
=TRUE)
95 correCoefsJsonFile
<- grep("genetic_corr_json", outputFiles
, value
=TRUE)
96 geneticDataFile
<- grep("combined_gebvs", inputFiles
, value
=TRUE)
97 selectionIndexFile
<- grep("selection_index", inputFiles
, value
=TRUE)
99 message('selectionIndexFile', selectionIndexFile
)
100 message('geneticDataFile', geneticDataFile
)
102 geneticData
<- read
.table(geneticDataFile
,
106 na
.strings
= c("NA"),
111 if (length(selectionIndexFile
) != 0
112 && file
.info(selectionIndexFile
)$size
!= 0) {
113 indexData
<- read
.table(selectionIndexFile
,
117 na
.strings
= c("NA"),
124 if (!is
.null(indexData
)) {
125 geneticData
<- rownames_to_column(geneticData
, var
="genotypes")
126 indexData
<- rownames_to_column(indexData
, var
="genotypes")
128 geneticData
<- geneticData
%>% arrange(genotypes
)
129 indexData
<- indexData
%>% arrange(genotypes
)
131 corrData
<- full_join(geneticData
, indexData
)
132 corrData
<- column_to_rownames(corrData
, var
="genotypes")
135 corrData
<- geneticData
138 correInputData
<- corrData
142 if (is
.null(correInputData
)) {
143 stop("Can't run correlation analysis. There is no input data.")
146 correInputData
<- correInputData
%>%
147 select(where(~n_distinct(.) > 2))
149 coefpvalues
<- rcor
.test(correInputData
,
154 coefs
<- coefpvalues$cor
.mat
155 #remove rows and columns that are all "NA"
156 coefs
<- data
.frame(coefs
)
157 if (any(is
.na(coefs
))) {
158 coefs
<- coefs
[ , colSums(is
.na(coefs
)) < nrow(coefs
)]
161 coefs
[upper
.tri(coefs
)] <- NA
162 pvalues
<- coefpvalues$cor
.mat
163 pvalues
[lower
.tri(pvalues
)] <- coefpvalues$p
.values
[, 3]
164 pvalues
<- round(pvalues
, 3)
165 pvalues
[upper
.tri(pvalues
)] <- NA
166 pvalues
<- data
.frame(pvalues
)
168 allcordata
<- coefpvalues$cor
.mat
169 allcordata
[upper
.tri(allcordata
)] <- coefpvalues$p
.values
[, 3]
170 diag(allcordata
) <- 1
171 allcordata
<- round(allcordata
, 3)
173 traits
<- colnames(coefs
)
175 correlationList
<- list(
181 correlationJson
<- jsonlite
::toJSON(correlationList
)
183 write
.table(allcordata
,
184 file
= correCoefsTableFile
,
190 write(correlationJson
,
191 file
= correCoefsJsonFile
)
193 message("Done running correlation.")
194 q(save
= "no", runLast
= FALSE)