3 #runs correlation analysis.
4 #Correlation coeffiecients are stored in tabular and json formats
7 # Isaak Y Tecle (iyt2@cornell.edu)
16 library(phenoAnalysis
)
22 allArgs
<- commandArgs()
24 outputFiles
<- scan(grep("output_files", allArgs
, value
= TRUE),
26 inputFiles
<- scan(grep("input_files", allArgs
, value
= TRUE),
29 message('inputFiles: ', inputFiles
)
32 correCoefsTableFile
<- c()
33 correCoefsJsonFile
<- c()
35 if (any(grepl("phenotype_data", inputFiles
))) {
36 refererQtl
<- grep("qtl", inputFiles
, value
=TRUE)
38 phenoDataFile
<- grep("\\/phenotype_data", inputFiles
, value
=TRUE)
39 formattedPhenoFile
<- grep("formatted_phenotype_data", inputFiles
, fixed
= FALSE, value
= TRUE)
40 metaFile
<- grep("metadata", inputFiles
, value
=TRUE)
41 correCoefsTableFile
<- grep("pheno_corr_table", outputFiles
, value
=TRUE)
42 correCoefsJsonFile
<- grep("pheno_corr_json", outputFiles
, value
=TRUE)
44 formattedPhenoData
<- c()
47 if ( length(refererQtl
) != 0 ) {
48 phenoDataFile
<- grep("\\/phenodata", inputFiles
, value
=TRUE)
49 phenoData
<- data
.frame(fread(phenoDataFile
,
52 na
.strings
=c("NA", "-", " ", ".", "..")))
55 metaData
<- scan(metaFile
, what
="character")
61 if (length(refererQtl
) != 0) {
62 allNames
<- names(phenoData
)
63 nonTraitNames
<- c("ID")
64 allTraitNames
<- allNames
[! allNames
%in% nonTraitNames
]
69 if (length(refererQtl
) == 0 ) {
70 averagedPhenoData
<- cleanAveragePhenotypes(inputFiles
, metaDataFile
= metaFile
)
71 allNames
<- names(averagedPhenoData
)
72 nonTraitNames
<- metaData
73 allTraitNames
<- allNames
[! allNames
%in% nonTraitNames
]
75 rownames(averagedPhenoData
) <- NULL
76 correPhenoData
<- averagedPhenoData
79 correPhenoData
<- phenoData
%>%
81 summarise_if(is
.numeric
, mean
, na
.rm
=TRUE) %>%
88 correInputData
<- correPhenoData
89 print(head(correPhenoData
))
91 } else if (any(grepl("combined_gebvs", inputFiles
)) ||
92 any(grepl("selection_index", inputFiles
))) {
94 correCoefsTableFile
<- grep("genetic_corr_table", outputFiles
, value
=TRUE)
95 correCoefsJsonFile
<- grep("genetic_corr_json", outputFiles
, value
=TRUE)
96 geneticDataFile
<- grep("combined_gebvs", inputFiles
, value
=TRUE)
97 selectionIndexFile
<- grep("selection_index", inputFiles
, value
=TRUE)
99 message('selectionIndexFile', selectionIndexFile
)
100 message('geneticDataFile', geneticDataFile
)
102 geneticData
<- read
.table(geneticDataFile
,
106 na
.strings
= c("NA"),
111 if (length(selectionIndexFile
) != 0
112 && file
.info(selectionIndexFile
)$size
!= 0) {
113 indexData
<- read
.table(selectionIndexFile
,
117 na
.strings
= c("NA"),
124 if (!is
.null(indexData
)) {
125 geneticData
<- rownames_to_column(geneticData
, var
="genotypes")
126 indexData
<- rownames_to_column(indexData
, var
="genotypes")
128 geneticData
<- geneticData
%>% arrange(genotypes
)
129 indexData
<- indexData
%>% arrange(genotypes
)
131 corrData
<- full_join(geneticData
, indexData
)
132 corrData
<- column_to_rownames(corrData
, var
="genotypes")
135 corrData
<- geneticData
138 correInputData
<- corrData
142 if (is
.null(correInputData
)) {
143 stop("Can't run correlation analysis. There is no input data.")
146 coefpvalues
<- rcor
.test(correInputData
,
151 coefs
<- coefpvalues$cor
.mat
152 #remove rows and columns that are all "NA"
153 coefs
<- data
.frame(coefs
)
154 if (any(is
.na(coefs
))) {
155 coefs
<- coefs
[ , colSums(is
.na(coefs
)) < nrow(coefs
)]
158 coefs
[upper
.tri(coefs
)] <- NA
159 pvalues
<- coefpvalues$cor
.mat
160 pvalues
[lower
.tri(pvalues
)] <- coefpvalues$p
.values
[, 3]
161 pvalues
<- round(pvalues
, 3)
162 pvalues
[upper
.tri(pvalues
)] <- NA
163 pvalues
<- data
.frame(pvalues
)
165 allcordata
<- coefpvalues$cor
.mat
166 allcordata
[upper
.tri(allcordata
)] <- coefpvalues$p
.values
[, 3]
167 diag(allcordata
) <- 1
168 allcordata
<- round(allcordata
, 3)
170 traits
<- colnames(coefs
)
172 correlationList
<- list(
178 correlationJson
<- jsonlite
::toJSON(correlationList
)
180 write
.table(allcordata
,
181 file
= correCoefsTableFile
,
187 write(correlationJson
,
188 file
= correCoefsJsonFile
)
190 message("Done running correlation.")
191 q(save
= "no", runLast
= FALSE)