R/solgwas/solgwas_script.R

   1 #install.packages("rrBLUP")
   2 #install.packages("corrplot")
   3 #install.packages("dplyr")
   4 library("methods")
   5 library("dplyr")
   6
   7
   8 ########################################
   9 ##### Read data from temp files #####
  10 ########################################
  11 args = commandArgs(trailingOnly = TRUE)
  12
  13 pheno <- read.table(args[1], sep = "\t", header = TRUE, stringsAsFactors = FALSE, check.names = FALSE)
  14 colnames(pheno)
  15
  16 #### Current script accepts genotype data with markers as rows and accessions as columns
  17 #### But rest of code operates on previous format which had genotype data with markers as columns and accessions as rows
  18 #### therefore the geno table is transposed when the A.mat function is called
  19 geno <- read.table(args[2], sep="\t", row.names = 1, header = TRUE)
  20 study_trait <- args[3]
  21 study_trait
  22 figure3_file_name <- args[4]
  23 figure4_file_name <- args[5]
  24 pc_check <- args[6]
  25 kinship_check <- args[7]
  26
  27
  28
  29 print("pc_check:")
  30 pc_check
  31 print("kinship_check:")
  32 kinship_check
  33
  34 # pheno[1:5,1:21]
  35 # Note: still need to test how well this pmatch deals with other trickier cases
  36 pheno_names <- names(pheno)
  37 pheno_names <- gsub(" ", ".", pheno_names)
  38 pheno_vector <- pheno[,pmatch(study_trait, pheno_names)]
  39 pheno_vector[1:5]
  40 # Make a new phenotype table, including only the phenotype selected:
  41 pheno_mod <- cbind(pheno, pheno_vector)
  42 #pheno_vector[1:10]
  43 #pheno_mod[1:5,1:18]
  44 #pheno[1:5,1:21]
  45 colnames(pheno_mod)
  46
  47 ### only the data for the trait selected....
  48 # write.table(pheno_mod, "pheno_mod_temp_file.csv", sep = ",", col.names = TRUE)
  49 # Shapiro-Wilk test for normality
  50 #shapiro.test(pheno[,18])
  51
  52 # retain only a single column of genotyped values per each genotype
  53 # (this is necessary because the input genotype table may contain duplicate stock ids - aka germplasmName is used yet - germplasmDbIds)
  54 geno_trim <- geno[,!duplicated(colnames(geno))]
  55 # genotype data is coded as 0,1,2 - convert this to -1,0,1
  56 geno_trim <- geno_trim - 1
  57 geno_trim[1:5,1:5]
  58 ##### Get marker data from marker names in geno file:
  59 coordinate_matrix <- matrix(nrow = length(row.names(geno_trim)), ncol = 3)
  60 for (marker in 1:length(row.names(geno_trim))) {
  61   current_string = strsplit(row.names(geno_trim)[marker], split='_', fixed=TRUE)
  62   coordinate_matrix[marker,1] = row.names(geno_trim)[marker]
  63   coordinate_matrix[marker,2] = current_string[[1]][1]
  64   coordinate_matrix[marker,3] = current_string[[1]][2]
  65 }
  66
  67 # Need to revisit filtering... for now, skip....
  68 #geno.filtered <- filtering.function(geno,0.4,0.60,0.05)
  69 geno.filtered <- geno_trim
  70 #dim(geno_trim)
  71 dim(geno.filtered)
  72 #geno.filtered[1:5,1:5]
  73
  74
  75
  76 ##### The data in the database has already been imputed, but we use the A.mat function here for MAF filtering and to generate the kinship matrix #####
  77 library(rrBLUP)
  78 ##### transposition of geno.filtered because that is what A.mat expects accessions as rows and markers as columns
  79 Imputation <- A.mat(t(geno.filtered),impute.method="EM",return.imputed=T,min.MAF=0.05)
  80
  81 K.mat <- Imputation$A ### KINSHIP matrix
  82 geno.gwas <- Imputation$imputed #NEW geno data.
  83 dim(geno.gwas)
  84
  85
  86
  87 ##### Work to match phenotyes and genotypes #####
  88 pheno_mod[1:5,1:18]
  89 # NOTA BENE: Currently extracting unique phenotype values, also need to exclude NAs, I think...
  90 # Ultimately, it may be better to take an average? TBD...
  91 dim(pheno_mod)
  92 pheno_mod=pheno_mod[which(pheno_mod$pheno_vector != "NA"),]
  93 print("Filtering out NAs...")
  94 dim(pheno_mod)
  95 #pheno_mod <- pheno_mod[!duplicated(pheno_mod$germplasmName),]
  96 pheno_mod <- distinct(pheno_mod, germplasmName, .keep_all = TRUE)
  97 print("Filtering out duplicated stock IDs, keeping only single row for each stock ID...")
  98 dim(pheno_mod)
  99 rownames(geno.gwas)
 100 colnames(pheno_mod)
 101 colnames(pheno)
 102 pheno_mod=pheno_mod[pheno_mod$germplasmName%in%rownames(geno.gwas),]
 103 print("Filtering out stock IDs not in geno matrix...")
 104 dim(pheno_mod)
 105 pheno_mod$germplasmName<-factor(as.character(pheno_mod$germplasmName), levels=rownames(geno.gwas)) #to ensure same levels on both files
 106 pheno_mod <- pheno_mod[order(pheno_mod$germplasmName),]
 107
 108 ####
 109 ###Creating file for GWAS function from rrBLUP package
 110 #pheno_mod$locationDbId<- as.factor(pheno_mod$locationDbId)
 111 ## Check the number of levels in the pheno_mod$locationDbId
 112 #location_levels <- nlevels(pheno_mod$locationDbId)
 113 #print("Number of Levels:")
 114 #location_levels
 115 ## Check model.matrix
 116 ## Set model.matrix to include locationDbId in the model, but not if this factor has only one level...
 117 #if (nlevels(pheno_mod$locationDbId) > 1) {
 118 #X<-model.matrix(~-1+locationDbId, data=pheno_mod)
 119 #} else {
 120 X<-model.matrix(~-1, data=pheno_mod)
 121 #}
 122 pheno.gwas <- data.frame(GID=pheno_mod$germplasmName,X,PHENO=pheno_mod$pheno_vector)
 123 pheno.gwas[1:5,1:2]
 124 geno.gwas <- geno.gwas[rownames(geno.gwas)%in%pheno.gwas$GID,]
 125 pheno.gwas <- pheno.gwas[pheno.gwas$GID%in%rownames(geno.gwas),]
 126 geno.gwas <- geno.gwas[rownames(geno.gwas)%in%rownames(K.mat),]
 127 K.mat <- K.mat[rownames(K.mat)%in%rownames(geno.gwas),colnames(K.mat)%in%rownames(geno.gwas)]
 128 pheno.gwas <- pheno.gwas[pheno.gwas$GID%in%rownames(K.mat),]
 129 pheno.gwas[1:5,1:2]
 130 geno.gwas[1:5,1:5]
 131
 132 ##### Match Genotype to the Scaffold & positions extracted above #####
 133 # Not necessary (my map is derived directly from geno...):
 134 # geno.gwas<-geno.gwas[,match(map$Markers,colnames(geno.gwas))]
 135 # head(map)
 136 # geno.gwas <- geno.gwas[,colnames(geno.gwas)%in%map$Markers]
 137 coordinate_matrix[1:5,1:3]
 138 dim(coordinate_matrix)
 139 coordinate_matrix <- coordinate_matrix[coordinate_matrix[,1]%in%colnames(geno.gwas),]
 140 dim(coordinate_matrix)
 141 geno.gwas2<- data.frame(mark=colnames(geno.gwas),chr=coordinate_matrix[,2],loc=coordinate_matrix[,3],t(geno.gwas))
 142 print("Done up to here")
 143 dim(geno.gwas2)
 144 colnames(geno.gwas2)[4:ncol(geno.gwas2)] <- rownames(geno.gwas)
 145 head(pheno.gwas)
 146 geno.gwas2[1:6,1:6]
 147 K.mat[1:5,1:5]
 148
 149 ##### Run the rrblup GWAS #####
 150 # Set plotting to false, do our own plotting
 151 # Choose which GWAS analysis to run based on the K-matrix flag and the PC flag:
 152 if (kinship_check == 0) {
 153    if (pc_check == 0) {
 154       gwasresults<-GWAS(pheno.gwas, geno.gwas2, fixed=NULL, K=NULL, plot=F, n.PC=0, min.MAF=0.05)
 155       print("Run model with no kinship, no pcs")
 156    } else {
 157       gwasresults<-GWAS(pheno.gwas, geno.gwas2, fixed=NULL, K=NULL, plot=F, n.PC=6, min.MAF=0.05)
 158       print("Run model with no kinship, yes pcs")
 159    }
 160 } else {
 161   if (pc_check == 0) {
 162      gwasresults<-GWAS(pheno.gwas, geno.gwas2, fixed=NULL, K=K.mat, plot=F, n.PC=0, min.MAF=0.05)
 163      print("Run model with yes kinship, no pcs")
 164   } else {
 165     gwasresults<-GWAS(pheno.gwas, geno.gwas2, fixed=NULL, K=K.mat, plot=F, n.PC = 6, min.MAF=0.05)
 166     print("Run model with yes kinship, yes pcs")
 167   }
 168
 169 }
 170
 171 ##### Manhanttan and QQ plots #####
 172 # Structure of results:
 173 # Fourth column contains the -log10(p-values)
 174 str(gwasresults)
 175 alpha_bonferroni=-log10(0.05/length(gwasresults$PHENO))
 176
 177 length(gwasresults$PHENO)
 178 head(gwasresults)
 179 alpha_bonferroni
 180 #chromosome_color_vector <- c("forestgreen","darkblue")[gwasresults$chr]
 181 chromosome_ids <- as.factor(gwasresults$chr)
 182 marker_indicator <- match(unique(gwasresults$chr), gwasresults$chr)
 183 png(figure3_file_name)
 184 plot(gwasresults$PHENO,col=chromosome_ids,ylab="-log10(pvalue)",
 185      main="Manhattan Plot",xaxt="n",xlab="Position",ylim=c(0,14))
 186 axis(1,at=marker_indicator,labels=gwasresults$chr[marker_indicator], cex.axis=0.8, las=2)
 187 #axis(1,at=c(1:length(unique(gwasresults$chr))),labels=unique(gwasresults$chr))
 188 abline(a=NULL,b=NULL,h=alpha_bonferroni,col="red",lwd=2)
 189 #abline(a=NULL,b=NULL,h=alpha_FDR_Yield,col="red",lwd=2,lty=2)
 190 legend(1,13.5, c("Bonferroni") ,
 191        lty=1, col=c('red', 'blue'), bty='n', cex=1,lwd=2)
 192 dev.off()
 193
 194
 195 N <- length(gwasresults$PHENO)
 196 expected.logvalues <- sort( -log10( c(1:N) * (1/N) ) )
 197 observed.logvalues <- sort( gwasresults$PHENO)
 198
 199 png(figure4_file_name)
 200 plot(expected.logvalues , observed.logvalues, main="QQ Plot",
 201      xlab="Expected -log p-values ",
 202      ylab="Observed -log p-values",col.main="black",col="coral1",pch=20)
 203 abline(0,1,lwd=3,col="black")
 204 dev.off()
 205
 206
 207
 208
 209 ##### Identify the markers that are above the bonferroni cutoff #####
 210
 211 which(gwasresults$PHENO>alpha_bonferroni)
 212 #which(gwasresults2$PHENO>alpha_bonferroni)
 213 #which(gwasresults3$PHENO>alpha_bonferroni)
 214 #which(gwasresults4$PHENO>alpha_bonferroni)