R/solgwas/solgwas_genoPCA_script.R

   1 #install.packages("rrBLUP")
   2 #install.packages("corrplot")
   3 #install.packages("dplyr")
   4 library("methods")
   5 library("dplyr")
   6
   7 ########################################
   8 ##### Read data from temp files #####
   9 ########################################
  10 args = commandArgs(trailingOnly = TRUE)
  11
  12
  13 geno <- read.table(args[1], sep="\t", row.names = 1, header = TRUE)
  14 figure2_file_name <- args[2]
  15
  16
  17 geno[1:5,1:5] ### View genotypic data.
  18 geno$row.names[1:5]
  19 # retain only a single row of genotyped values per each genotype
  20 # (this is necessary because the input genotype table may contain duplicate stock ids - aka germplasmDbIds)
  21 geno_trim <- geno[!duplicated(row.names(geno)),]
  22
  23 dim(geno)
  24 dim(geno_trim)
  25 geno_trim[1:5,1:5]
  26 dim(geno_trim)
  27 dim(geno_trim)
  28 geno_trim[1:5,1:5]
  29 # genotype data is coded as 0,1,2 - convert this to -1,0,1
  30 geno_trim <- geno_trim - 1
  31
  32 ##### Get marker data from marker names in geno file:
  33 #coordinate_matrix <- matrix(nrow = length(colnames(geno_trim)), ncol = 3)
  34 #for (marker in 1:length(colnames(geno_trim))) {
  35 #  current_string = strsplit(colnames(geno_trim)[marker], split='_', fixed=TRUE)
  36 #  coordinate_matrix[marker,1] = colnames(geno_trim)[marker]
  37 #  coordinate_matrix[marker,2] = current_string[[1]][1]
  38 #  coordinate_matrix[marker,3] = current_string[[1]][2]
  39 #}
  40
  41 geno.filtered <- geno_trim
  42 dim(geno.filtered)
  43
  44 ##### The data in the database has already been imputed, but we use the A.mat function here for MAF filtering and to generate the kinship matrix #####
  45 library(rrBLUP)
  46 Imputation <- A.mat(t(geno.filtered),impute.method="EM",return.imputed=T,min.MAF=0.05)
  47
  48 K.mat <- Imputation$A ### KINSHIP matrix
  49 geno.gwas <- Imputation$imputed #NEW geno data.
  50 dim(geno.gwas)
  51
  52 ##### PCA/Population Structure #####
  53 # Centering the data
  54 geno.scale <- scale(geno.gwas,center=T,scale=F)
  55 svdgeno <- svd(geno.scale)
  56 PCA <- geno.scale%*%svdgeno$v #Principal components
  57 PCA[1:5,1:5]
  58 ## Screeplot to visualize the proportion of variance explained by PCA
  59 #
  60 #plot(round((svdgeno$d)^2/sum((svdgeno$d)^2),d=7)[1:10],type="o",main="Screeplot",xlab="PCAs",ylab="% variance")
  61 #
  62 ##Proportion of variance explained by PCA1 and PCA2
  63 PCA1 <- 100*round((svdgeno$d[1])^2/sum((svdgeno$d)^2),d=3); PCA1
  64 PCA2 <- 100*round((svdgeno$d[2])^2/sum((svdgeno$d)^2),d=3); PCA2
  65 ### Plotting Principal components.
  66 png(figure2_file_name)
  67 plot(PCA[,1],PCA[,2],xlab=paste("PC1: ",PCA1,"%",sep=""),ylab=paste("PC2: ",PCA2,"%",sep=""),pch=20,cex=0.7,main="PCA Plot")
  68 dev.off()