1 #install.packages("rrBLUP")
2 #install.packages("corrplot")
3 #install.packages("dplyr")
7 ########################################
8 ##### Read data from temp files #####
9 ########################################
10 args = commandArgs(trailingOnly = TRUE)
13 geno <- read.table(args[1], sep="\t", row.names = 1, header = TRUE)
14 figure2_file_name <- args[2]
17 geno[1:5,1:5] ### View genotypic data.
19 # retain only a single row of genotyped values per each genotype
20 # (this is necessary because the input genotype table may contain duplicate stock ids - aka germplasmDbIds)
21 geno_trim <- geno[!duplicated(row.names(geno)),]
29 # genotype data is coded as 0,1,2 - convert this to -1,0,1
30 geno_trim <- geno_trim - 1
32 ##### Get marker data from marker names in geno file:
33 #coordinate_matrix <- matrix(nrow = length(colnames(geno_trim)), ncol = 3)
34 #for (marker in 1:length(colnames(geno_trim))) {
35 # current_string = strsplit(colnames(geno_trim)[marker], split='_', fixed=TRUE)
36 # coordinate_matrix[marker,1] = colnames(geno_trim)[marker]
37 # coordinate_matrix[marker,2] = current_string[[1]][1]
38 # coordinate_matrix[marker,3] = current_string[[1]][2]
41 geno.filtered <- geno_trim
44 ##### The data in the database has already been imputed, but we use the A.mat function here for MAF filtering and to generate the kinship matrix #####
46 Imputation <- A.mat(t(geno.filtered),impute.method="EM",return.imputed=T,min.MAF=0.05)
48 K.mat <- Imputation$A ### KINSHIP matrix
49 geno.gwas <- Imputation$imputed #NEW geno data.
52 ##### PCA/Population Structure #####
54 geno.scale <- scale(geno.gwas,center=T,scale=F)
55 svdgeno <- svd(geno.scale)
56 PCA <- geno.scale%*%svdgeno$v #Principal components
58 ## Screeplot to visualize the proportion of variance explained by PCA
60 #plot(round((svdgeno$d)^2/sum((svdgeno$d)^2),d=7)[1:10],type="o",main="Screeplot",xlab="PCAs",ylab="% variance")
62 ##Proportion of variance explained by PCA1 and PCA2
63 PCA1 <- 100*round((svdgeno$d[1])^2/sum((svdgeno$d)^2),d=3); PCA1
64 PCA2 <- 100*round((svdgeno$d[2])^2/sum((svdgeno$d)^2),d=3); PCA2
65 ### Plotting Principal components.
66 png(figure2_file_name)
67 plot(PCA[,1],PCA[,2],xlab=paste("PC1: ",PCA1,"%",sep=""),ylab=paste("PC2: ",PCA2,"%",sep=""),pch=20,cex=0.7,main="PCA Plot")