1 #install.packages("rrBLUP")
2 #install.packages("corrplot")
3 #install.packages("dplyr")
8 ########################################
9 ##### Read data from temp files #####
10 ########################################
11 args = commandArgs(trailingOnly = TRUE)
13 pheno <- read.table(args[1], sep = "\t", header = TRUE, stringsAsFactors = FALSE, check.names = FALSE)
16 #### Current script accepts genotype data with markers as rows and accessions as columns
17 #### But rest of code operates on previous format which had genotype data with markers as columns and accessions as rows
18 #### therefore the geno table is transposed when the A.mat function is called
19 geno <- read.table(args[2], sep="\t", row.names = 1, header = TRUE)
20 study_trait <- args[3]
22 figure3_file_name <- args[4]
23 figure4_file_name <- args[5]
25 kinship_check <- args[7]
31 print("kinship_check:")
35 # Note: still need to test how well this pmatch deals with other trickier cases
36 pheno_names <- names(pheno)
37 pheno_names <- gsub(" ", ".", pheno_names)
38 pheno_vector <- pheno[,pmatch(study_trait, pheno_names)]
40 # Make a new phenotype table, including only the phenotype selected:
41 pheno_mod <- cbind(pheno, pheno_vector)
47 ### only the data for the trait selected....
48 # write.table(pheno_mod, "pheno_mod_temp_file.csv", sep = ",", col.names = TRUE)
49 # Shapiro-Wilk test for normality
50 #shapiro.test(pheno[,18])
52 # retain only a single column of genotyped values per each genotype
53 # (this is necessary because the input genotype table may contain duplicate stock ids - aka germplasmName is used yet - germplasmDbIds)
54 geno_trim <- geno[,!duplicated(colnames(geno))]
55 # genotype data is coded as 0,1,2 - convert this to -1,0,1
56 geno_trim <- geno_trim - 1
58 ##### Get marker data from marker names in geno file:
59 coordinate_matrix <- matrix(nrow = length(row.names(geno_trim)), ncol = 3)
60 for (marker in 1:length(row.names(geno_trim))) {
61 current_string = strsplit(row.names(geno_trim)[marker], split='_', fixed=TRUE)
62 coordinate_matrix[marker,1] = row.names(geno_trim)[marker]
63 coordinate_matrix[marker,2] = current_string[[1]][1]
64 coordinate_matrix[marker,3] = current_string[[1]][2]
67 # Need to revisit filtering... for now, skip....
68 #geno.filtered <- filtering.function(geno,0.4,0.60,0.05)
69 geno.filtered <- geno_trim
72 #geno.filtered[1:5,1:5]
76 ##### The data in the database has already been imputed, but we use the A.mat function here for MAF filtering and to generate the kinship matrix #####
78 ##### transposition of geno.filtered because that is what A.mat expects accessions as rows and markers as columns
79 Imputation <- A.mat(t(geno.filtered),impute.method="EM",return.imputed=T,min.MAF=0.05)
81 K.mat <- Imputation$A ### KINSHIP matrix
82 geno.gwas <- Imputation$imputed #NEW geno data.
87 ##### Work to match phenotyes and genotypes #####
89 # NOTA BENE: Currently extracting unique phenotype values, also need to exclude NAs, I think...
90 # Ultimately, it may be better to take an average? TBD...
92 pheno_mod=pheno_mod[which(pheno_mod$pheno_vector != "NA"),]
93 print("Filtering out NAs...")
95 #pheno_mod <- pheno_mod[!duplicated(pheno_mod$germplasmName),]
96 pheno_mod <- distinct(pheno_mod, germplasmName, .keep_all = TRUE)
97 print("Filtering out duplicated stock IDs, keeping only single row for each stock ID...")
102 pheno_mod=pheno_mod[pheno_mod$germplasmName%in%rownames(geno.gwas),]
103 print("Filtering out stock IDs not in geno matrix...")
105 pheno_mod$germplasmName<-factor(as.character(pheno_mod$germplasmName), levels=rownames(geno.gwas)) #to ensure same levels on both files
106 pheno_mod <- pheno_mod[order(pheno_mod$germplasmName),]
109 ###Creating file for GWAS function from rrBLUP package
110 #pheno_mod$locationDbId<- as.factor(pheno_mod$locationDbId)
111 ## Check the number of levels in the pheno_mod$locationDbId
112 #location_levels <- nlevels(pheno_mod$locationDbId)
113 #print("Number of Levels:")
115 ## Check model.matrix
116 ## Set model.matrix to include locationDbId in the model, but not if this factor has only one level...
117 #if (nlevels(pheno_mod$locationDbId) > 1) {
118 #X<-model.matrix(~-1+locationDbId, data=pheno_mod)
120 X<-model.matrix(~-1, data=pheno_mod)
122 pheno.gwas <- data.frame(GID=pheno_mod$germplasmName,X,PHENO=pheno_mod$pheno_vector)
124 geno.gwas <- geno.gwas[rownames(geno.gwas)%in%pheno.gwas$GID,]
125 pheno.gwas <- pheno.gwas[pheno.gwas$GID%in%rownames(geno.gwas),]
126 geno.gwas <- geno.gwas[rownames(geno.gwas)%in%rownames(K.mat),]
127 K.mat <- K.mat[rownames(K.mat)%in%rownames(geno.gwas),colnames(K.mat)%in%rownames(geno.gwas)]
128 pheno.gwas <- pheno.gwas[pheno.gwas$GID%in%rownames(K.mat),]
132 ##### Match Genotype to the Scaffold & positions extracted above #####
133 # Not necessary (my map is derived directly from geno...):
134 # geno.gwas<-geno.gwas[,match(map$Markers,colnames(geno.gwas))]
136 # geno.gwas <- geno.gwas[,colnames(geno.gwas)%in%map$Markers]
137 coordinate_matrix[1:5,1:3]
138 dim(coordinate_matrix)
139 coordinate_matrix <- coordinate_matrix[coordinate_matrix[,1]%in%colnames(geno.gwas),]
140 dim(coordinate_matrix)
141 geno.gwas2<- data.frame(mark=colnames(geno.gwas),chr=coordinate_matrix[,2],loc=coordinate_matrix[,3],t(geno.gwas))
142 print("Done up to here")
144 colnames(geno.gwas2)[4:ncol(geno.gwas2)] <- rownames(geno.gwas)
149 ##### Run the rrblup GWAS #####
150 # Set plotting to false, do our own plotting
151 # Choose which GWAS analysis to run based on the K-matrix flag and the PC flag:
152 if (kinship_check == 0) {
154 gwasresults<-GWAS(pheno.gwas, geno.gwas2, fixed=NULL, K=NULL, plot=F, n.PC=0, min.MAF=0.05)
155 print("Run model with no kinship, no pcs")
157 gwasresults<-GWAS(pheno.gwas, geno.gwas2, fixed=NULL, K=NULL, plot=F, n.PC=6, min.MAF=0.05)
158 print("Run model with no kinship, yes pcs")
162 gwasresults<-GWAS(pheno.gwas, geno.gwas2, fixed=NULL, K=K.mat, plot=F, n.PC=0, min.MAF=0.05)
163 print("Run model with yes kinship, no pcs")
165 gwasresults<-GWAS(pheno.gwas, geno.gwas2, fixed=NULL, K=K.mat, plot=F, n.PC = 6, min.MAF=0.05)
166 print("Run model with yes kinship, yes pcs")
171 ##### Manhanttan and QQ plots #####
172 # Structure of results:
173 # Fourth column contains the -log10(p-values)
175 alpha_bonferroni=-log10(0.05/length(gwasresults$PHENO))
177 length(gwasresults$PHENO)
180 #chromosome_color_vector <- c("forestgreen","darkblue")[gwasresults$chr]
181 chromosome_ids <- as.factor(gwasresults$chr)
182 marker_indicator <- match(unique(gwasresults$chr), gwasresults$chr)
183 png(figure3_file_name)
184 plot(gwasresults$PHENO,col=chromosome_ids,ylab="-log10(pvalue)",
185 main="Manhattan Plot",xaxt="n",xlab="Position",ylim=c(0,14))
186 axis(1,at=marker_indicator,labels=gwasresults$chr[marker_indicator], cex.axis=0.8, las=2)
187 #axis(1,at=c(1:length(unique(gwasresults$chr))),labels=unique(gwasresults$chr))
188 abline(a=NULL,b=NULL,h=alpha_bonferroni,col="red",lwd=2)
189 #abline(a=NULL,b=NULL,h=alpha_FDR_Yield,col="red",lwd=2,lty=2)
190 legend(1,13.5, c("Bonferroni") ,
191 lty=1, col=c('red', 'blue'), bty='n', cex=1,lwd=2)
195 N <- length(gwasresults$PHENO)
196 expected.logvalues <- sort( -log10( c(1:N) * (1/N) ) )
197 observed.logvalues <- sort( gwasresults$PHENO)
199 png(figure4_file_name)
200 plot(expected.logvalues , observed.logvalues, main="QQ Plot",
201 xlab="Expected -log p-values ",
202 ylab="Observed -log p-values",col.main="black",col="coral1",pch=20)
203 abline(0,1,lwd=3,col="black")
209 ##### Identify the markers that are above the bonferroni cutoff #####
211 which(gwasresults$PHENO>alpha_bonferroni)
212 #which(gwasresults2$PHENO>alpha_bonferroni)
213 #which(gwasresults3$PHENO>alpha_bonferroni)
214 #which(gwasresults4$PHENO>alpha_bonferroni)