R/solgwas/solgwas_script.R

   1 #install.packages("rrBLUP")
   2 #install.packages("corrplot")
   3 #install.packages("dplyr")
   4 library("methods")
   5 library("dplyr")
   6
   7
   8 ########################################
   9 ##### Read data from temp files #####
  10 ########################################
  11 args = commandArgs(trailingOnly = TRUE)
  12
  13 pheno <- read.table(args[1], sep = "\t", header = TRUE, stringsAsFactors = FALSE, check.names = FALSE, comment.char = "", quote = "")
  14 colnames(pheno)
  15
  16 #### Current script accepts genotype data with markers as rows and accessions as columns
  17 #### But rest of code operates on previous format which had genotype data with markers as columns and accessions as rows
  18 #### therefore the geno table is transposed when the A.mat function is called
  19 geno <- read.table(args[2], sep="\t", row.names = 1, header = TRUE, check.names = FALSE)
  20 study_trait <- args[3]
  21 study_trait
  22 figure3_file_name <- args[4]
  23 figure4_file_name <- args[5]
  24 pc_check <- args[6]
  25 kinship_check <- args[7]
  26 gwasresultsPhenoCsv <- args[8]
  27
  28
  29
  30 print("pc_check:")
  31 pc_check
  32 print("kinship_check:")
  33 kinship_check
  34
  35 # pheno[1:5,1:21]
  36 # Note: still need to test how well this pmatch deals with other trickier cases
  37 pheno_names <- names(pheno)
  38 pheno_names <- gsub(" ", ".", pheno_names)
  39 pheno_names <- gsub("/", ".", pheno_names)
  40 pheno_names <- gsub("[|]", ".", pheno_names)
  41 pheno_names <- gsub("[%()]", ".", pheno_names)
  42
  43 pheno_vector <- pheno[,pmatch(study_trait, pheno_names)]
  44 pheno_vector[1:5]
  45 # Make a new phenotype table, including only the phenotype selected:
  46 pheno_mod <- cbind(pheno, pheno_vector)
  47 #pheno_vector[1:10]
  48 #pheno_mod[1:5,1:18]
  49 #pheno[1:5,1:21]
  50 colnames(pheno_mod)
  51
  52 ### only the data for the trait selected....
  53 # write.table(pheno_mod, "pheno_mod_temp_file.csv", sep = ",", col.names = TRUE)
  54 # Shapiro-Wilk test for normality
  55 #shapiro.test(pheno[,18])
  56
  57 # retain only a single column of genotyped values per each genotype
  58 # (this is necessary because the input genotype table may contain duplicate stock ids - aka germplasmName is used yet - germplasmDbIds)
  59 mrkData <- geno[,-(1:2)]
  60 geno_trim <- mrkData[,!duplicated(colnames(mrkData))]
  61 # genotype data is coded as 0,1,2 - convert this to -1,0,1
  62 geno_trim <- geno_trim - 1
  63 geno_trim[1:5,1:5]
  64 ##### Get marker data from marker names in geno file:
  65 coordinate_matrix <- matrix(nrow = length(row.names(geno_trim)), ncol = 3)
  66 for (marker in 1:length(row.names(geno_trim))) {
  67   current_string = strsplit(row.names(geno_trim)[marker], split='_', fixed=TRUE)
  68   coordinate_matrix[marker,1] = row.names(geno_trim)[marker]
  69   coordinate_matrix[marker,2] = current_string[[1]][1]
  70   coordinate_matrix[marker,3] = current_string[[1]][2]
  71 }
  72
  73 # filter markers that have more than 20% missing because EM imputation will fail
  74 minAccn <- ncol(geno_trim) * (1/5)
  75 geno.filtered <- geno_trim[rowSums(is.na(geno_trim)) <= minAccn,]
  76 dim(geno.filtered)
  77 #geno.filtered[1:5,1:5]
  78
  79
  80 ##### The data in the database has already been imputed, but we use the A.mat function here for MAF filtering and to generate the kinship matrix #####
  81 library(rrBLUP)
  82 ##### transposition of geno.filtered because that is what A.mat expects accessions as rows and markers as columns
  83 mrkRelMat <- A.mat(t(geno.filtered),return.imputed=FALSE)
  84 Imputation <- A.mat(t(geno.filtered),impute.method="EM",return.imputed=T,min.MAF=0.05)
  85
  86 K.mat <- Imputation$A ### KINSHIP matrix
  87 geno.gwas <- Imputation$imputed #NEW geno data.
  88 dim(geno.gwas)
  89
  90 ##### Work to match phenotyes and genotypes #####
  91 pheno_mod[1:5,1:18]
  92 # NOTA BENE: Currently extracting unique phenotype values, also need to exclude NAs, I think...
  93 # Ultimately, it may be better to take an average? TBD...
  94 dim(pheno_mod)
  95 pheno_mod=pheno_mod[which(pheno_mod$pheno_vector != "NA"),]
  96 print("Filtering out NAs...")
  97 dim(pheno_mod)
  98 #pheno_mod <- pheno_mod[!duplicated(pheno_mod$germplasmName),]
  99 pheno_mod <- distinct(pheno_mod, germplasmName, .keep_all = TRUE)
 100 print("Filtering out duplicated stock IDs, keeping only single row for each stock ID...")
 101 dim(pheno_mod)
 102 rownames(geno.gwas)
 103 colnames(pheno_mod)
 104 colnames(pheno)
 105 pheno_mod=pheno_mod[pheno_mod$germplasmName%in%rownames(geno.gwas),]
 106 print("Filtering out stock IDs not in geno matrix...")
 107 dim(pheno_mod)
 108 pheno_mod$germplasmName<-factor(as.character(pheno_mod$germplasmName), levels=rownames(geno.gwas)) #to ensure same levels on both files
 109 pheno_mod <- pheno_mod[order(pheno_mod$germplasmName),]
 110
 111 ####
 112 ###Creating file for GWAS function from rrBLUP package
 113 #pheno_mod$locationDbId<- as.factor(pheno_mod$locationDbId)
 114 ## Check the number of levels in the pheno_mod$locationDbId
 115 #location_levels <- nlevels(pheno_mod$locationDbId)
 116 #print("Number of Levels:")
 117 #location_levels
 118 ## Check model.matrix
 119 ## Set model.matrix to include locationDbId in the model, but not if this factor has only one level...
 120 #if (nlevels(pheno_mod$locationDbId) > 1) {
 121 #X<-model.matrix(~-1+locationDbId, data=pheno_mod)
 122 #} else {
 123 X<-model.matrix(~-1, data=pheno_mod)
 124 #}
 125 pheno.gwas <- data.frame(GID=pheno_mod$germplasmName,X,PHENO=pheno_mod$pheno_vector)
 126 pheno.gwas[1:5,1:2]
 127 geno.gwas <- geno.gwas[rownames(geno.gwas)%in%pheno.gwas$GID,]
 128 pheno.gwas <- pheno.gwas[pheno.gwas$GID%in%rownames(geno.gwas),]
 129 geno.gwas <- geno.gwas[rownames(geno.gwas)%in%rownames(K.mat),]
 130 K.mat <- K.mat[rownames(K.mat)%in%rownames(geno.gwas),colnames(K.mat)%in%rownames(geno.gwas)]
 131 pheno.gwas <- pheno.gwas[pheno.gwas$GID%in%rownames(K.mat),]
 132 pheno.gwas[1:5,1:2]
 133 geno.gwas[1:5,1:5]
 134
 135 ##### Match Genotype to the Scaffold & positions extracted above #####
 136 # Not necessary (my map is derived directly from geno...):
 137 # geno.gwas<-geno.gwas[,match(map$Markers,colnames(geno.gwas))]
 138 # head(map)
 139 # geno.gwas <- geno.gwas[,colnames(geno.gwas)%in%map$Markers]
 140 coordinate_matrix[1:5,1:3]
 141 dim(coordinate_matrix)
 142 #coordinate_matrix <- coordinate_matrix[coordinate_matrix[,1]%in%colnames(geno.gwas),]
 143 coordinate_matrix <- geno[rownames(geno)%in%colnames(geno.gwas),]
 144 dim(coordinate_matrix)
 145 geno.gwas2<- data.frame(mark=colnames(geno.gwas),chr=coordinate_matrix[,1],loc=coordinate_matrix[,2],t(geno.gwas))
 146 print("Done up to here")
 147 dim(geno.gwas2)
 148 colnames(geno.gwas2)[4:ncol(geno.gwas2)] <- rownames(geno.gwas)
 149 head(pheno.gwas)
 150 geno.gwas2[1:6,1:6]
 151 K.mat[1:5,1:5]
 152
 153 ##### Run the rrblup GWAS #####
 154 # Set plotting to false, do our own plotting
 155 # Choose which GWAS analysis to run based on the K-matrix flag and the PC flag:
 156 if (kinship_check == 0) {
 157    if (pc_check == 0) {
 158       gwasresults<-GWAS(pheno.gwas, geno.gwas2, fixed=NULL, K=NULL, plot=F, n.PC=0, min.MAF=0.05)
 159       print("Run model with no kinship, no pcs")
 160    } else {
 161       gwasresults<-GWAS(pheno.gwas, geno.gwas2, fixed=NULL, K=NULL, plot=F, n.PC=6, min.MAF=0.05)
 162       print("Run model with no kinship, yes pcs")
 163    }
 164 } else {
 165   if (pc_check == 0) {
 166      gwasresults<-GWAS(pheno.gwas, geno.gwas2, fixed=NULL, K=K.mat, plot=F, n.PC=0, min.MAF=0.05)
 167      print("Run model with yes kinship, no pcs")
 168   } else {
 169     gwasresults<-GWAS(pheno.gwas, geno.gwas2, fixed=NULL, K=K.mat, plot=F, n.PC = 6, min.MAF=0.05)
 170     print("Run model with yes kinship, yes pcs")
 171   }
 172
 173 }
 174
 175 ##### Manhanttan and QQ plots #####
 176 # Structure of results:
 177 # Fourth column contains the -log10(p-values)
 178 str(gwasresults)
 179 alpha_bonferroni=-log10(0.05/length(gwasresults$PHENO))
 180
 181 length(gwasresults$PHENO)
 182 head(gwasresults)
 183 alpha_bonferroni
 184 #chromosome_color_vector <- c("forestgreen","darkblue")[gwasresults$chr]
 185 chromosome_ids <- as.factor(gwasresults$chr)
 186 marker_indicator <- match(unique(gwasresults$chr), gwasresults$chr)
 187 png(figure3_file_name)
 188 plot(gwasresults$PHENO,col=chromosome_ids,ylab="-log10(pvalue)",
 189      main="Manhattan Plot",xaxt="n",xlab="Position",ylim=c(0,14))
 190 axis(1,at=marker_indicator,labels=gwasresults$chr[marker_indicator], cex.axis=0.8, las=2)
 191 #axis(1,at=c(1:length(unique(gwasresults$chr))),labels=unique(gwasresults$chr))
 192 abline(a=NULL,b=NULL,h=alpha_bonferroni,col="red",lwd=2)
 193 #abline(a=NULL,b=NULL,h=alpha_FDR_Yield,col="red",lwd=2,lty=2)
 194 legend(1,13.5, c("Bonferroni") ,
 195        lty=1, col=c('red', 'blue'), bty='n', cex=1,lwd=2)
 196 dev.off()
 197
 198 # write results to csv file only for testing purpose - not for client use
 199 write.csv(gwasresults$PHENO, file = gwasresultsPhenoCsv)
 200
 201 N <- length(gwasresults$PHENO)
 202 expected.logvalues <- sort( -log10( c(1:N) * (1/N) ) )
 203 observed.logvalues <- sort( gwasresults$PHENO)
 204
 205 png(figure4_file_name)
 206 plot(expected.logvalues , observed.logvalues, main="QQ Plot",
 207      xlab="Expected -log p-values ",
 208      ylab="Observed -log p-values",col.main="black",col="coral1",pch=20)
 209 abline(0,1,lwd=3,col="black")
 210 dev.off()
 211
 212
 213
 214
 215 ##### Identify the markers that are above the bonferroni cutoff #####
 216
 217 which(gwasresults$PHENO>alpha_bonferroni)
 218 #which(gwasresults2$PHENO>alpha_bonferroni)
 219 #which(gwasresults3$PHENO>alpha_bonferroni)
 220 #which(gwasresults4$PHENO>alpha_bonferroni)