R/solgwas/solgwas_genoPCA_script.R

   1 #install.packages("rrBLUP")
   2 #install.packages("corrplot")
   3 #install.packages("dplyr")
   4 library("methods")
   5 library("dplyr")
   6
   7 ########################################
   8 ##### Read data from temp files #####
   9 ########################################
  10 args = commandArgs(trailingOnly = TRUE)
  11
  12
  13 geno <- read.table(args[1], sep="\t", row.names = 1, header = TRUE)
  14 figure2_file_name <- args[2]
  15
  16
  17 geno[1:5,1:5] ### View genotypic data.
  18 geno$row.names[1:5]
  19 # retain only a single row of genotyped values per each genotype
  20 # (this is necessary because the input genotype table may contain duplicate stock ids - aka germplasmDbIds)
  21 geno_trim <- geno[!duplicated(row.names(geno)),]
  22
  23 dim(geno)
  24 dim(geno_trim)
  25 geno_trim[1:5,1:5]
  26 dim(geno_trim)
  27 dim(geno_trim)
  28 geno_trim[1:5,1:5]
  29 # genotype data is coded as 0,1,2 - convert this to -1,0,1
  30 geno_trim <- geno_trim - 1
  31
  32 ##### Get marker data from marker names in geno file:
  33 #coordinate_matrix <- matrix(nrow = length(colnames(geno_trim)), ncol = 3)
  34 #for (marker in 1:length(colnames(geno_trim))) {
  35 #  current_string = strsplit(colnames(geno_trim)[marker], split='_', fixed=TRUE)
  36 #  coordinate_matrix[marker,1] = colnames(geno_trim)[marker]
  37 #  coordinate_matrix[marker,2] = current_string[[1]][1]
  38 #  coordinate_matrix[marker,3] = current_string[[1]][2]
  39 #}
  40
  41 # filter markers that have more than 20% missing because EM imputation will fail
  42 minAccn <- ncol(geno_trim) * (1/5)
  43 geno.filtered <- geno_trim[rowSums(is.na(geno_trim)) <= minAccn,]
  44 dim(geno.filtered)
  45
  46 ##### The data in the database has already been imputed, but we use the A.mat function here for MAF filtering and to generate the kinship matrix #####
  47 library(rrBLUP)
  48 Imputation <- A.mat(t(geno.filtered),impute.method="EM",return.imputed=T,min.MAF=0.05)
  49
  50 K.mat <- Imputation$A ### KINSHIP matrix
  51 geno.gwas <- Imputation$imputed #NEW geno data.
  52 dim(geno.gwas)
  53
  54 ##### PCA/Population Structure #####
  55 # Centering the data
  56 geno.scale <- scale(geno.gwas,center=T,scale=F)
  57 svdgeno <- svd(geno.scale)
  58 PCA <- geno.scale%*%svdgeno$v #Principal components
  59 PCA[1:5,1:5]
  60 ## Screeplot to visualize the proportion of variance explained by PCA
  61 #
  62 #plot(round((svdgeno$d)^2/sum((svdgeno$d)^2),d=7)[1:10],type="o",main="Screeplot",xlab="PCAs",ylab="% variance")
  63 #
  64 ##Proportion of variance explained by PCA1 and PCA2
  65 PCA1 <- 100*round((svdgeno$d[1])^2/sum((svdgeno$d)^2),d=3); PCA1
  66 PCA2 <- 100*round((svdgeno$d[2])^2/sum((svdgeno$d)^2),d=3); PCA2
  67 ### Plotting Principal components.
  68 png(figure2_file_name)
  69 plot(PCA[,1],PCA[,2],xlab=paste("PC1: ",PCA1,"%",sep=""),ylab=paste("PC2: ",PCA2,"%",sep=""),pch=20,cex=0.7,main="PCA Plot")
  70 dev.off()