Merge pull request #4719 from solgenomics/topic/fixing_histogram
[sgn.git] / R / solGS / kinship.r
blob75d2d2a1b9837c0eeca122f481256aae67b779e8
1 #SNOPSIS
2 #calculates kinship, indbreeding coefficients
4 #AUTHOR
5 # Isaak Y Tecle (iyt2@cornell.edu)
7 options(echo = FALSE)
9 library(methods)
10 library(rrBLUP)
11 library(plyr)
12 library(stringr)
13 #library(lme4)
14 library(randomForest)
15 library(parallel)
16 library(genoDataFilter)
17 library(dplyr)
18 library(tibble)
19 library(rlang)
20 library(jsonlite)
21 library(data.table)
25 allArgs <- commandArgs()
28 inputFiles <- scan(grep("input_files", allArgs, value = TRUE),
29 what = "character")
31 outputFiles <- scan(grep("output_files", allArgs, value = TRUE),
32 what = "character")
34 genoData <- c()
36 createGenoData <- function(inputFiles) {
38 genoFiles <- grep("genotype_data", inputFiles, value = TRUE)
40 genoMetaData <- c()
41 filteredGenoFile <- c()
43 if (length(genoFiles) > 1) {
44 genoData <- combineGenoData(genoFiles)
46 genoMetaData <- genoData$trial
47 genoData$trial <- NULL
49 } else {
51 genoFile <- genoFiles
52 genoData <- fread(genoFile,
53 header = TRUE,
54 na.strings = c("NA", " ", "--", "-", "."))
56 if (is.null(genoData)) {
57 filteredGenoFile <- grep("filtered_genotype_data_", genoFile, value = TRUE)
58 genoData <- fread(filteredGenoFile, header = TRUE)
62 genoData <- unique(genoData, by = 'V1')
63 genoData <- data.frame(genoData)
64 genoData <- column_to_rownames(genoData, 'V1')
67 if (is.null(genoData)) {
68 stop("There is no genotype dataset.")
69 q("no", 1, FALSE)
70 } else {
72 ##genoDataFilter::filterGenoData
73 genoData <- convertToNumeric(genoData)
74 genoData <- filterGenoData(genoData, maf=0.01)
75 genoData <- roundAlleleDosage(genoData)
77 message("No. of geno missing values, ", sum(is.na(genoData)))
78 if (sum(is.na(genoData)) > 0) {
79 genoData <- na.roughfix(genoData)
82 genoData <- data.frame(genoData)
86 genoData <- createGenoData(inputFiles)
87 genoData <- genoData[order(row.names(genoData)), ]
89 #change genotype coding to [-1, 0, 1], to use the A.mat ) if [0, 1, 2]
90 genoTrCode <- grep("2", genoData[1, ], value = TRUE)
91 if(length(genoTrCode) != 0) {
92 genoData <- genoData - 1
95 relationshipMatrixFile <- grep("relationship_matrix_adjusted_table", outputFiles, value = TRUE)
96 relationshipMatrixJsonFile <- grep("relationship_matrix_adjusted_json", outputFiles, value = TRUE)
98 message('matrix file ', relationshipMatrixFile)
99 message('json file ', relationshipMatrixJsonFile)
101 inbreedingFile <- grep('inbreeding_coefficients', outputFiles, value=TRUE)
102 aveKinshipFile <- grep('average_kinship', outputFiles, value=TRUE)
104 message('inbreeding file ', inbreedingFile)
105 message('ave file ', aveKinshipFile)
107 relationshipMatrix <- c()
108 inbreeding <- c()
109 aveKinship <- c()
110 relationshipMatrixJson <- c()
113 relationshipMatrix <- A.mat(genoData)
114 diag(relationshipMatrix) <- diag(relationshipMatrix) + 1e-6
115 genos <- rownames(relationshipMatrix)
117 relationshipMatrix <- data.frame(relationshipMatrix)
119 colnames(relationshipMatrix) <- genos
120 rownames(relationshipMatrix) <- genos
122 relationshipMatrix <- relationshipMatrix %>%
123 rownames_to_column('genotypes') %>%
124 mutate_if(is.numeric, round, 3) %>%
125 column_to_rownames('genotypes')
129 inbreeding <- diag(data.matrix(relationshipMatrix))
130 inbreeding <- inbreeding - 1
131 diag(relationshipMatrix) <- inbreeding
133 relationshipMatrix <- relationshipMatrix %>% replace(., . < 0, 0)
134 relationshipMatrix <- relationshipMatrix %>% replace(., . > 1, 0.99)
136 inbreeding <- inbreeding %>% replace(., . < 0, 0)
137 inbreeding <- inbreeding %>% replace(., . > 1, . - 1)
138 inbreeding <- data.frame(inbreeding)
140 inbreeding <- inbreeding %>%
141 rownames_to_column('genotypes') %>%
142 rename(Inbreeding = inbreeding) %>%
143 arrange(Inbreeding) %>%
144 mutate_at('Inbreeding', round, 3) %>%
145 column_to_rownames('genotypes')
148 aveKinship <- data.frame(apply(relationshipMatrix, 1, mean))
150 aveKinship <- aveKinship %>%
151 rownames_to_column('genotypes') %>%
152 rename(Mean_kinship = contains('apply')) %>%
153 arrange(Mean_kinship) %>%
154 mutate_at('Mean_kinship', round, 3) %>%
155 column_to_rownames('genotypes')
157 relationshipMatrixJson <- relationshipMatrix
158 relationshipMatrixJson[upper.tri(relationshipMatrixJson)] <- NA
161 #relationshipMatrixJson <- data.frame(relationshipMatrixJson)
162 relationshipMatrixList <- list(labels = names(relationshipMatrixJson),
163 values = relationshipMatrixJson)
164 relationshipMatrixJson <- jsonlite::toJSON(relationshipMatrixList)
167 #if (file.info(relationshipMatrixFile)$size == 0) {
169 fwrite(relationshipMatrix,
170 file = relationshipMatrixFile,
171 row.names = TRUE,
172 sep = "\t",
173 quote = FALSE,
177 #if (file.info(relationshipMatrixJsonFile)$size == 0) {
179 write(relationshipMatrixJson,
180 file = relationshipMatrixJsonFile,
185 message('inbreedingfile ', inbreedingFile)
186 message('ave file', aveKinshipFile)
187 message('kinshipfile ', relationshipMatrixFile)
188 #if (file.info(inbreedingFile)$size == 0) {
190 fwrite(inbreeding,
191 file = inbreedingFile,
192 row.names = TRUE,
193 sep = "\t",
194 quote = FALSE,
199 #if (file.info(aveKinshipFile)$size == 0) {
201 fwrite(aveKinship,
202 file = aveKinshipFile,
203 row.names = TRUE,
204 sep = "\t",
205 quote = FALSE,
210 message("Done.")
212 q(save = "no", runLast = FALSE)