3 #runs phenotypic heritability analysis.
4 #Heritability coeffiecients are stored in tabular and json formats
7 # Christiano Simoes (ccs263@cornell.edu)
15 library(phenoAnalysis)
20 allArgs <- commandArgs()
21 outputFiles <- scan(grep("output_files", allArgs, value = TRUE),
24 inputFiles <- scan(grep("input_files", allArgs, value = TRUE),
28 refererQtl <- grep("qtl", inputFiles, value=TRUE)
30 phenoDataFile <- grep("\\/phenotype_data", inputFiles, value=TRUE)
31 formattedPhenoFile <- grep("formatted_phenotype_data", inputFiles, fixed = FALSE, value = TRUE)
32 metadataFile <- grep("metadata", inputFiles, value=TRUE)
34 h2CoefficientsFile <- grep("h2_coefficients_table", outputFiles, value=TRUE)
35 h2CoefficientsJsonFile <- grep("h2_coefficients_json", outputFiles, value=TRUE)
37 formattedPhenoData <- c()
40 phenoData <- as.data.frame(fread(phenoDataFile, sep="\t",
41 na.strings = c("NA", "", "--", "-", ".", "..")
44 metaData <- scan(metadataFile, what="character")
45 cat("Dim phenoData ", dim(phenoData),"\n")
51 if (length(refererQtl) != 0) {
53 allNames <- names(phenoData)
54 nonTraitNames <- c("ID")
55 allTraitNames <- allNames[! allNames %in% nonTraitNames]
58 allNames <- names(phenoData)
59 nonTraitNames <- metaData
60 allTraitNames <- allNames[! allNames %in% nonTraitNames]
69 #Calculating missing data
70 missingData <- apply(phenoData, 2, function(x) sum(is.na(x)))
71 md = data.frame(missingData)
72 rangeTraits <- which(colnames(phenoData) %in% allTraitNames)
74 # Calculating the number of replicates per accession
75 replicateData <- data.frame(replicates = tapply(phenoData$replicate, phenoData$germplasmName, function(x){
76 return(max(unique(x)))
78 replicateData <- tibble::rownames_to_column(replicateData, "germplasmName")
82 #Removing non numeric data
83 for( traits in rangeTraits){
84 if(is.numeric(phenoData[,traits]) == 'FALSE'){
85 phenoData <- phenoData[,-traits]
86 allTraitNames <- allTraitNames[-which(allTraitNames == colnames(phenoData)[traits])]
90 #Range after filtering
91 rangeTraits <- which(names(phenoData)%in%allTraitNames)
93 nTraits <- length(rangeTraits)
94 # Preparing variance vectors
101 for (i in rangeTraits) {
102 outcome = colnames(phenoData)[i]
104 #Calculating missing data per trait
105 missingData <- data.frame(missingData = tapply(phenoData[,outcome], phenoData$germplasmName, function(x){
106 return(length(which(is.na(x))))
109 missingData <- tibble::rownames_to_column(missingData, "germplasmName")
110 missingReplicates <- dplyr::left_join(missingData, replicateData, by = "germplasmName")
111 missingReplicates$limitRep <- missingReplicates$replicates - missingReplicates$missingData
112 missingReplicates <- missingReplicates[missingReplicates$limitRep > 1,]
114 # Filtering the dataset
115 phenoData <- phenoData[phenoData$germplasmName %in% missingReplicates$germplasmName,]
118 print(paste0('outcome ', outcome))
120 model <- lmer(get(outcome)~ (1|germplasmName) + replicate,
121 na.action = na.exclude,
125 # variance = as.data.frame(VarCorr(model))
126 variance = data.frame(VarCorr(model))
128 H2 = variance$vcov[1]/ (variance$vcov[1] + variance$vcov[2])
129 her = append(her, round(as.numeric(H2), digits =3))
130 Vg = append(Vg, round(as.numeric(variance$vcov[1]), digits = 3))
131 Vres = append(Vres, round(as.numeric(variance$vcov[2]), digits = 3))
132 resp_var = append(resp_var, colnames(phenoData)[i])
136 #Prepare information to export data
137 Heritability = data.frame(resp_var,Vg, Vres, her)
139 Heritability = Heritability %>%
148 #remove rows and columns that are all "NA"
149 heritability2json <- function(mat) {
150 mat <- as.list(as.data.frame(t(mat)))
155 traits <- Heritability$trait
157 heritabilityList <- list(
158 "traits" = toJSON(traits),
159 "coeffiecients" =heritability2json(Heritability)
162 heritabilityJson <- paste("{",paste("\"", names(heritabilityList), "\":", heritabilityList, collapse=","), "}")
164 heritabilityJson <- list(heritabilityJson)
167 file = h2CoefficientsFile,
173 fwrite(heritabilityJson,
174 file = h2CoefficientsJsonFile,
181 q(save = "no", runLast = FALSE)