make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / R / heritability / h2_rscript.R
blob71bd2655b12d2d8cdede77d0def772f85e4b9ce1
1  #SNOPSIS
3  #runs phenotypic heritability analysis.
4  #Heritability coeffiecients are stored in tabular and json formats 
6  #AUTHOR
7  # Christiano Simoes (ccs263@cornell.edu)
10 options(echo = FALSE)
12 library(ltm)
13 library(rjson)
14 library(data.table)
15 library(phenoAnalysis)
16 library(dplyr)
17 library(methods)
18 library(na.tools)
20 allArgs <- commandArgs()
21 outputFiles <- scan(grep("output_files", allArgs, value = TRUE),
22                     what = "character")
24 inputFiles  <- scan(grep("input_files", allArgs, value = TRUE),
25                     what = "character")
28 refererQtl <- grep("qtl", inputFiles, value=TRUE)
30 phenoDataFile      <- grep("\\/phenotype_data", inputFiles, value=TRUE)
31 formattedPhenoFile <- grep("formatted_phenotype_data", inputFiles, fixed = FALSE, value = TRUE)
32 metadataFile       <-  grep("metadata", inputFiles, value=TRUE)
34 h2CoefficientsFile     <- grep("h2_coefficients_table", outputFiles, value=TRUE)
35 h2CoefficientsJsonFile <- grep("h2_coefficients_json", outputFiles, value=TRUE)
37 formattedPhenoData <- c()
38 phenoData          <- c()
40 phenoData <- as.data.frame(fread(phenoDataFile, sep="\t",
41                                    na.strings = c("NA", "", "--", "-", ".", "..")
42                                    ))
44 metaData <- scan(metadataFile, what="character")
45 cat("Dim phenoData ", dim(phenoData),"\n")
47 allTraitNames <- c()
48 nonTraitNames <- c()
49 naTraitNames  <- c()
51 if (length(refererQtl) != 0) {
53   allNames      <- names(phenoData)
54   nonTraitNames <- c("ID")
55   allTraitNames <- allNames[! allNames %in% nonTraitNames]
57 } else {
58   allNames <- names(phenoData)
59   nonTraitNames <- metaData
60   allTraitNames <- allNames[! allNames %in% nonTraitNames]
63 print("Trait names:")
64 print(allTraitNames)
66 # print("colnames:")
67 # colnames(phenoData)
69 #Calculating missing data
70 missingData <- apply(phenoData, 2, function(x) sum(is.na(x)))
71 md = data.frame(missingData)
72 rangeTraits <- which(colnames(phenoData) %in% allTraitNames)
74 # Calculating the number of replicates per accession
75 replicateData <- data.frame(replicates = tapply(phenoData$replicate, phenoData$germplasmName, function(x){
76   return(max(unique(x)))
77 }))
78 replicateData <- tibble::rownames_to_column(replicateData, "germplasmName")
82 #Removing non numeric data
83 for( traits in rangeTraits){
84   if(is.numeric(phenoData[,traits]) == 'FALSE'){
85     phenoData <- phenoData[,-traits]
86     allTraitNames <- allTraitNames[-which(allTraitNames == colnames(phenoData)[traits])]
87   }
90 #Range after filtering
91 rangeTraits <- which(names(phenoData)%in%allTraitNames)
93 nTraits <- length(rangeTraits)
94 # Preparing variance vectors
95 her = c()
96 Vg = c()
97 Vres = c()
98 resp_var = c()
100 library(lmerTest)
101 for (i in rangeTraits) {
102   outcome = colnames(phenoData)[i]   
104   #Calculating missing data per trait
105   missingData <- data.frame(missingData = tapply(phenoData[,outcome], phenoData$germplasmName, function(x){
106     return(length(which(is.na(x))))
107   }))
109   missingData <- tibble::rownames_to_column(missingData, "germplasmName")
110   missingReplicates <- dplyr::left_join(missingData, replicateData, by = "germplasmName")
111   missingReplicates$limitRep <- missingReplicates$replicates - missingReplicates$missingData
112   missingReplicates <- missingReplicates[missingReplicates$limitRep > 1,]
114   # Filtering the dataset
115   phenoData <- phenoData[phenoData$germplasmName %in% missingReplicates$germplasmName,]
116    
117   
118   print(paste0('outcome ', outcome))
119   
120   model <- lmer(get(outcome)~ (1|germplasmName) + replicate,
121                 na.action = na.exclude,
122                 data=phenoData)
123   
124   
125   # variance = as.data.frame(VarCorr(model))
126   variance = data.frame(VarCorr(model))
127   
128   H2 = variance$vcov[1]/ (variance$vcov[1] + variance$vcov[2])
129   her = append(her, round(as.numeric(H2), digits =3))
130   Vg = append(Vg, round(as.numeric(variance$vcov[1]), digits = 3))
131   Vres = append(Vres, round(as.numeric(variance$vcov[2]), digits = 3))
132   resp_var = append(resp_var, colnames(phenoData)[i])
136 #Prepare information to export data
137 Heritability = data.frame(resp_var,Vg, Vres, her)
138 print(Heritability)
139 Heritability = Heritability %>% 
140   dplyr::rename(
141     trait = resp_var,
142     Hert = her,
143     Vg = Vg,
144     Vres = Vres
145   )
146 print(Heritability)
148 #remove rows and columns that are all "NA"
149 heritability2json <- function(mat) {
150     mat <- as.list(as.data.frame(t(mat)))
151     names(mat) <- NULL
152     toJSON(mat)
155 traits <- Heritability$trait
157 heritabilityList <- list(
158                      "traits" = toJSON(traits),
159                      "coeffiecients" =heritability2json(Heritability)
160                    )
162 heritabilityJson <- paste("{",paste("\"", names(heritabilityList), "\":", heritabilityList, collapse=","), "}")
164 heritabilityJson <- list(heritabilityJson)
166 fwrite(Heritability,
167        file      = h2CoefficientsFile,
168        row.names = FALSE,
169        sep       = "\t",
170        quote     = FALSE,
171        )
173 fwrite(heritabilityJson,
174        file      = h2CoefficientsJsonFile,
175        col.names = FALSE,
176        row.names = FALSE,
177        qmethod   = "escape"
178        )
181 q(save = "no", runLast = FALSE)