R/etc/bg.R

   1 #!/usr/bin/env Rscript
   2 # run this in the output directory for rnaseq_pipeline.sh
   3 # passing the pheno data csv file as the only argument
   4 args = commandArgs(trailingOnly=TRUE)
   5 if (length(args)==0) {
   6 # assume no output directory argument was given to rnaseq_pipeline.sh
   7   pheno_data_file <- paste0(getwd(),"/sample.csv")
   8 } else {
   9   pheno_data_file <- args[1]
  10 }
  11
  12 library(ballgown)
  13 library(RSkittleBrewer)
  14 library(genefilter)
  15 library(dplyr)
  16 library(devtools)
  17
  18 ## Read phenotype sample data
  19 pheno_data <- read.csv(pheno_data_file)
  20
  21 ## Read in expression data
  22 out <- ballgown(dataDir = "out", samplePattern= "", pData=pheno_data)
  23
  24 ## Filter low abundance genes
  25 out_filt <- subset(out, "rowVars(texpr(out)) > 1", genomesubset=TRUE)
  26
  27 ## Divided by group
  28 sampleL <- subset(out_filt,"tissue %in% 'L'", genomesubset=FALSE)
  29 sampleB <- subset(out_filt,"tissue %in% 'B'", genomesubset=FALSE)
  30 sampleE <- subset(out_filt,"tissue %in% 'E'", genomesubset=FALSE)
  31
  32 ## Divided by content
  33 LA <- subset(sampleL,"gko %in% 'WT'", genomesubset=FALSE)
  34 LB <- subset(sampleL,"exp %in% 'CD'", genomesubset=FALSE)
  35 LC <- subset(sampleL,"tko %in% c('CD-WT','HFD-KO')", genomesubset=FALSE)
  36 LD <- subset(sampleL,"exp %in% 'HFD'", genomesubset=FALSE)
  37 BA <- subset(sampleB,"gko %in% 'WT'", genomesubset=FALSE)
  38 BB <- subset(sampleB,"exp %in% 'CD'", genomesubset=FALSE)
  39 BC <- subset(sampleB,"tko %in% c('CD-WT','HFD-KO')", genomesubset=FALSE)
  40 BD <- subset(sampleB,"exp %in% 'HFD'", genomesubset=FALSE)
  41 EA <- subset(sampleE,"gko %in% 'WT'", genomesubset=FALSE)
  42 EB <- subset(sampleE,"exp %in% 'CD'", genomesubset=FALSE)
  43 EC <- subset(sampleE,"tko %in% c('CD-WT','HFD-KO')", genomesubset=FALSE)
  44 ED <- subset(sampleE,"exp %in% 'HFD'", genomesubset=FALSE)
  45 LA_data <- subset(pheno_data,tissue=="L"&gko=="WT",genomesubset=FALSE)
  46
  47 WA <- subset(out_filt,"gko %in% 'WT'", genomesubset=FALSE)
  48 WB <- subset(out_filt,"exp %in% 'CD'", genomesubset=FALSE)
  49 WC <- subset(out_filt,"tko %in% c('CD-WT','HFD-KO')", genomesubset=FALSE)
  50 WD <- subset(out_filt,"exp %in% 'HFD'", genomesubset=FALSE)
  51
  52 ## DE by transcript
  53 LAresults_transcripts <-  stattest(LA, feature='transcript', covariate="tko", getFC=TRUE, meas='FPKM')
  54 LBresults_transcripts <-  stattest(LB, feature='transcript', covariate="tko", getFC=TRUE, meas='FPKM')
  55 LCresults_transcripts <-  stattest(LC, feature='transcript', covariate="tko", getFC=TRUE, meas='FPKM')
  56 LDresults_transcripts <-  stattest(LD, feature='transcript', covariate="tko", getFC=TRUE, meas='FPKM')
  57 BAresults_transcripts <-  stattest(BA, feature='transcript', covariate="tko", getFC=TRUE, meas='FPKM')
  58 BBresults_transcripts <-  stattest(BB, feature='transcript', covariate="tko", getFC=TRUE, meas='FPKM')
  59 BCresults_transcripts <-  stattest(BC, feature='transcript', covariate="tko", getFC=TRUE, meas='FPKM')
  60 BDresults_transcripts <-  stattest(BD, feature='transcript', covariate="tko", getFC=TRUE, meas='FPKM')
  61 EAresults_transcripts <-  stattest(EA, feature='transcript', covariate="tko", getFC=TRUE, meas='FPKM')
  62 EBresults_transcripts <-  stattest(EB, feature='transcript', covariate="tko", getFC=TRUE, meas='FPKM')
  63 ECresults_transcripts <-  stattest(EC, feature='transcript', covariate="tko", getFC=TRUE, meas='FPKM')
  64 EDresults_transcripts <-  stattest(ED, feature='transcript', covariate="tko", getFC=TRUE, meas='FPKM')
  65
  66 WAresults_transcripts <-  stattest(WA, feature='transcript', covariate="tko", adjustvars=c('tissue'), getFC=TRUE, meas='FPKM')
  67 WBresults_transcripts <-  stattest(WB, feature='transcript', covariate="tko", adjustvars=c('tissue'), getFC=TRUE, meas='FPKM')
  68 WCresults_transcripts <-  stattest(WC, feature='transcript', covariate="tko", adjustvars=c('tissue'), getFC=TRUE, meas='FPKM')
  69 WDresults_transcripts <-  stattest(WD, feature='transcript', covariate="tko", adjustvars=c('tissue'), getFC=TRUE, meas='FPKM')
  70
  71 ## DE by gene
  72 LAresults_genes <- stattest(LA, feature='gene', covariate='tko',getFC=TRUE, meas='FPKM')
  73 LBresults_genes <- stattest(LB, feature='gene', covariate='tko',getFC=TRUE, meas='FPKM')
  74 LCresults_genes <- stattest(LC, feature='gene', covariate='tko',getFC=TRUE, meas='FPKM')
  75 LDresults_genes <- stattest(LD, feature='gene', covariate='tko',getFC=TRUE, meas='FPKM')
  76 BAresults_genes <- stattest(BA, feature='gene', covariate='tko',getFC=TRUE, meas='FPKM')
  77 BBresults_genes <- stattest(BB, feature='gene', covariate='tko',getFC=TRUE, meas='FPKM')
  78 BCresults_genes <- stattest(BC, feature='gene', covariate='tko',getFC=TRUE, meas='FPKM')
  79 BDresults_genes <- stattest(BD, feature='gene', covariate='tko',getFC=TRUE, meas='FPKM')
  80 EAresults_genes <- stattest(EA, feature='gene', covariate='tko',getFC=TRUE, meas='FPKM')
  81 EBresults_genes <- stattest(EB, feature='gene', covariate='tko',getFC=TRUE, meas='FPKM')
  82 ECresults_genes <- stattest(EC, feature='gene', covariate='tko',getFC=TRUE, meas='FPKM')
  83 EDresults_genes <- stattest(ED, feature='gene', covariate='tko',getFC=TRUE, meas='FPKM')
  84
  85 ## Add gene name
  86 LAresults_transcripts <- data.frame(chr=LA@expr$trans$chr,strand=LA@expr$trans$strand,start=LA@expr$trans$start,end=LA@expr$trans$end,geneNames=ballgown::geneNames(LA),geneIDs=ballgown::geneIDs(LA), LAresults_transcripts)
  87 LBresults_transcripts <- data.frame(chr=LB@expr$trans$chr,strand=LB@expr$trans$strand,start=LB@expr$trans$start,end=LB@expr$trans$end,geneNames=ballgown::geneNames(LB),geneIDs=ballgown::geneIDs(LB), LBresults_transcripts)
  88 LCresults_transcripts <- data.frame(chr=LC@expr$trans$chr,strand=LC@expr$trans$strand,start=LC@expr$trans$start,end=LC@expr$trans$end,geneNames=ballgown::geneNames(LC),geneIDs=ballgown::geneIDs(LC), LCresults_transcripts)
  89 LDresults_transcripts <- data.frame(chr=LD@expr$trans$chr,strand=LD@expr$trans$strand,start=LD@expr$trans$start,end=LD@expr$trans$end,geneNames=ballgown::geneNames(LD),geneIDs=ballgown::geneIDs(LD), LDresults_transcripts)
  90 BAresults_transcripts <- data.frame(chr=BA@expr$trans$chr,strand=BA@expr$trans$strand,start=BA@expr$trans$start,end=BA@expr$trans$end,geneNames=ballgown::geneNames(BA),geneIDs=ballgown::geneIDs(BA), BAresults_transcripts)
  91 BBresults_transcripts <- data.frame(chr=BB@expr$trans$chr,strand=BB@expr$trans$strand,start=BB@expr$trans$start,end=BB@expr$trans$end,geneNames=ballgown::geneNames(BB),geneIDs=ballgown::geneIDs(BB), BBresults_transcripts)
  92 BCresults_transcripts <- data.frame(chr=BC@expr$trans$chr,strand=BC@expr$trans$strand,start=BC@expr$trans$start,end=BC@expr$trans$end,geneNames=ballgown::geneNames(BC),geneIDs=ballgown::geneIDs(BC), BCresults_transcripts)
  93 BDresults_transcripts <- data.frame(chr=BD@expr$trans$chr,strand=BD@expr$trans$strand,start=BD@expr$trans$start,end=BD@expr$trans$end,geneNames=ballgown::geneNames(BD),geneIDs=ballgown::geneIDs(BD), BDresults_transcripts)
  94 EAresults_transcripts <- data.frame(chr=EA@expr$trans$chr,strand=EA@expr$trans$strand,start=EA@expr$trans$start,end=EA@expr$trans$end,geneNames=ballgown::geneNames(EA),geneIDs=ballgown::geneIDs(EA), EAresults_transcripts)
  95 EBresults_transcripts <- data.frame(chr=EB@expr$trans$chr,strand=EB@expr$trans$strand,start=EB@expr$trans$start,end=EB@expr$trans$end,geneNames=ballgown::geneNames(EB),geneIDs=ballgown::geneIDs(EB), EBresults_transcripts)
  96 ECresults_transcripts <- data.frame(chr=EC@expr$trans$chr,strand=EC@expr$trans$strand,start=EC@expr$trans$start,end=EC@expr$trans$end,geneNames=ballgown::geneNames(EC),geneIDs=ballgown::geneIDs(EC), ECresults_transcripts)
  97 EDresults_transcripts <- data.frame(chr=ED@expr$trans$chr,strand=ED@expr$trans$strand,start=ED@expr$trans$start,end=ED@expr$trans$end,geneNames=ballgown::geneNames(ED),geneIDs=ballgown::geneIDs(ED), EDresults_transcripts)
  98
  99 WAresults_transcripts <- data.frame(chr=WA@expr$trans$chr,strand=WA@expr$trans$strand,start=WA@expr$trans$start,end=WA@expr$trans$end,geneNames=ballgown::geneNames(WA),geneIDs=ballgown::geneIDs(WA), WAresults_transcripts)
 100 WBresults_transcripts <- data.frame(chr=WB@expr$trans$chr,strand=WB@expr$trans$strand,start=WB@expr$trans$start,end=WB@expr$trans$end,geneNames=ballgown::geneNames(WB),geneIDs=ballgown::geneIDs(WB), WBresults_transcripts)
 101 WCresults_transcripts <- data.frame(chr=WC@expr$trans$chr,strand=WC@expr$trans$strand,start=WC@expr$trans$start,end=WC@expr$trans$end,geneNames=ballgown::geneNames(WC),geneIDs=ballgown::geneIDs(WC), WCresults_transcripts)
 102 WDresults_transcripts <- data.frame(chr=WD@expr$trans$chr,strand=WD@expr$trans$strand,start=WD@expr$trans$start,end=WD@expr$trans$end,geneNames=ballgown::geneNames(WD),geneIDs=ballgown::geneIDs(WD), WDresults_transcripts)
 103
 104 ## Sort results from smallest p-value
 105 LAresults_transcripts <- arrange(LAresults_transcripts, pval)
 106 LAresults_genes <-  arrange(LAresults_genes, pval)
 107 LBresults_transcripts <- arrange(LBresults_transcripts, pval)
 108 LBresults_genes <-  arrange(LBresults_genes, pval)
 109 LCresults_transcripts <- arrange(LCresults_transcripts, pval)
 110 LCresults_genes <-  arrange(LCresults_genes, pval)
 111 LDresults_transcripts <- arrange(LDresults_transcripts, pval)
 112 LDresults_genes <-  arrange(LDresults_genes, pval)
 113 BAresults_transcripts <- arrange(BAresults_transcripts, pval)
 114 BAresults_genes <-  arrange(BAresults_genes, pval)
 115 BBresults_transcripts <- arrange(BBresults_transcripts, pval)
 116 BBresults_genes <-  arrange(BBresults_genes, pval)
 117 BCresults_transcripts <- arrange(BCresults_transcripts, pval)
 118 BCresults_genes <-  arrange(BCresults_genes, pval)
 119 BDresults_transcripts <- arrange(BDresults_transcripts, pval)
 120 BDresults_genes <-  arrange(BDresults_genes, pval)
 121 EAresults_transcripts <- arrange(EAresults_transcripts, pval)
 122 EAresults_genes <-  arrange(EAresults_genes, pval)
 123 EBresults_transcripts <- arrange(EBresults_transcripts, pval)
 124 EBresults_genes <-  arrange(EBresults_genes, pval)
 125 ECresults_transcripts <- arrange(ECresults_transcripts, pval)
 126 ECresults_genes <-  arrange(ECresults_genes, pval)
 127 EDresults_transcripts <- arrange(EDresults_transcripts, pval)
 128 EDresults_genes <-  arrange(EDresults_genes, pval)
 129
 130 WAresults_transcripts <- arrange(WAresults_transcripts, pval)
 131 WBresults_transcripts <- arrange(WBresults_transcripts, pval)
 132 WCresults_transcripts <- arrange(WCresults_transcripts, pval)
 133 WDresults_transcripts <- arrange(WDresults_transcripts, pval)
 134
 135 ## Write results to CSV
 136 write.csv(LAresults_transcripts, "LA_transcripts_results.csv", row.names=FALSE)
 137 write.csv(LAresults_genes, "LA_genes_results.csv", row.names=FALSE)
 138 write.csv(LBresults_transcripts, "LB_transcripts_results.csv", row.names=FALSE)
 139 write.csv(LBresults_genes, "LB_genes_results.csv", row.names=FALSE)
 140 write.csv(LCresults_transcripts, "LC_transcripts_results.csv", row.names=FALSE)
 141 write.csv(LCresults_genes, "LC_genes_results.csv", row.names=FALSE)
 142 write.csv(LDresults_transcripts, "LD_transcripts_results.csv", row.names=FALSE)
 143 write.csv(LDresults_genes, "LD_genes_results.csv", row.names=FALSE)
 144 write.csv(BAresults_transcripts, "BA_transcripts_results.csv", row.names=FALSE)
 145 write.csv(BAresults_genes, "BA_genes_results.csv", row.names=FALSE)
 146 write.csv(BBresults_transcripts, "BB_transcripts_results.csv", row.names=FALSE)
 147 write.csv(BBresults_genes, "BB_genes_results.csv", row.names=FALSE)
 148 write.csv(BCresults_transcripts, "BC_transcripts_results.csv", row.names=FALSE)
 149 write.csv(BCresults_genes, "BC_genes_results.csv", row.names=FALSE)
 150 write.csv(BDresults_transcripts, "BD_transcripts_results.csv", row.names=FALSE)
 151 write.csv(BDresults_genes, "BD_genes_results.csv", row.names=FALSE)
 152 write.csv(EAresults_transcripts, "EA_transcripts_results.csv", row.names=FALSE)
 153 write.csv(EAresults_genes, "EA_genes_results.csv", row.names=FALSE)
 154 write.csv(EBresults_transcripts, "EB_transcripts_results.csv", row.names=FALSE)
 155 write.csv(EBresults_genes, "EB_genes_results.csv", row.names=FALSE)
 156 write.csv(ECresults_transcripts, "EC_transcripts_results.csv", row.names=FALSE)
 157 write.csv(ECresults_genes, "EC_genes_results.csv", row.names=FALSE)
 158 write.csv(EDresults_transcripts, "ED_transcripts_results.csv", row.names=FALSE)
 159 write.csv(EDresults_genes, "ED_genes_results.csv", row.names=FALSE)
 160
 161 write.csv(WAresults_transcripts, "WA_transcripts_results.csv", row.names=FALSE)
 162 write.csv(WBresults_transcripts, "WB_transcripts_results.csv", row.names=FALSE)
 163 write.csv(WCresults_transcripts, "WC_transcripts_results.csv", row.names=FALSE)
 164 write.csv(WDresults_transcripts, "WD_transcripts_results.csv", row.names=FALSE)
 165
 166 ## Filter for genes with q-val <0.05
 167 subset(WAresults_transcripts, WAresults_transcripts$qval <=0.05)
 168 #subset(LAresults_genes, LAresults_genes$qval <=0.05)
 169
 170 ## Plotting setup
 171 #tropical <- c('darkorange', 'dodgerblue', 'hotpink', 'limegreen', 'yellow')
 172 #palette(tropical)
 173
 174 ## Plotting gene abundance distribution
 175 #fpkm <- texpr(bg_chrX, meas='FPKM')
 176 #fpkm <- log2(fpkm +1)
 177 #boxplot(fpkm, col=as.numeric(pheno_data$sex), las=2,ylab='log2(FPKM+1)')
 178
 179 ## Plot individual transcripts
 180 #ballgown::transcriptNames(bg_chrX)[12]
 181 #plot(fpkm[12,] ~ pheno_data$sex, border=c(1,2),
 182 #     main=paste(ballgown::geneNames(bg_chrX)[12], ' : ',ballgown::transcriptNames(bg_chrX)[12]),
 183 #     pch=19, xlab="Sex", ylab='log2(FPKM+1)')
 184 #points(fpkm[12,] ~ jitter(as.numeric(pheno_data$sex)), col=as.numeric(pheno_data$sex))
 185
 186 ## Plot gene of transcript 1729
 187 #plotTranscripts(ballgown::geneIDs(bg_chrX)[1729], bg_chrX,
 188 #                main=c('Gene XIST in sample ERR188234'), sample=c('ERR188234'))
 189
 190 ## Plot average expression
 191 #plotMeans(ballgown::geneIDs(bg_chrX)[203], bg_chrX_filt, groupvar="sex", legend=FALSE)
 192
 193 ##print gene abundance distribution in your screen
 194 pdf(file="1.Gene_abundance_distribution.pdf",width=15,height=9)
 195 par(mai=c(1.8,1,1,1))
 196 tropical <- c('darkorange', 'dodgerblue', 'hotpink', 'limegreen', 'yellow')
 197 palette(tropical)
 198 fpkm <- texpr(out_filt, meas='FPKM')
 199 fpkm <- log2(fpkm +1)
 200 boxplot(fpkm, col=as.numeric(pheno_data$tko), las=2,ylab='log2(FPKM+1)')
 201 dev.off()
 202
 203 ##print individual transcripts
 204 pdf(file="2.Individual_transcripts.pdf")
 205 ballgown::transcriptNames(out_filt)[15]
 206 plot(fpkm[15,] ~ pheno_data$gko, border=c(1,2),main=paste(ballgown::geneNames(out_filt)[15], ' : ',ballgown::transcriptNames(out_filt)[15]),pch=19, xlab="Gko", ylab='log2(FPKM+1)')
 207 points(fpkm[15,] ~ jitter(as.numeric(pheno_data$gko)), col=as.numeric(pheno_data$gko))
 208 dev.off()
 209
 210 ##print Plot gene of transcript
 211 pdf(file="3.Plot_gene_of_transcript.pdf")
 212 plotTranscripts(ballgown::geneIDs(out_filt)[8219], out_filt,main=c('Gene XIST in sample HFD-L14'), sample=c('HFD-L14'))
 213 dev.off()
 214
 215 ##print the compare of average expression
 216 pdf(file="4.Comparison_of_average_expression.pdf")
 217 plotTranscripts(ballgown::geneIDs(out_filt)[8219], out_filt,main=c('Gene XIST in sample HFD-L14'), sample=c('HFD-L14'))
 218 plotMeans('MSTRG.7196', out_filt,groupvar="gko",legend=FALSE)
 219 dev.off()
 220
 221 #print the first picture in sequence
 222 fpkm2 <- texpr(out_filt, meas='FPKM')
 223 tbl_fpkm2<-tbl_df(fpkm2)
 224 colnames(tbl_fpkm2) <- c("CD-B1","CD-B2","CD-B3","CD-B4","CD-B5","CD-E1","CD-E2","CD-E4","CD-E5","CD-KO-B10","CD-KO-B6","CD-KO-B7","CD-KO-B8","CD-KO-E10","CD-KO-E6","CD-KO-E7","CD-KO-E8","CD-KO-E9","CD-KO-L10","CD-KO-L6","CD-KO-L7","CD-KO-L8","CD-KO-L9","CD-L1","CD-L2","CD-L3","CD-L4","CD-L5","HFD-B11","HFD-B12","HFD-B13","HFD-B14","HFD-B15","HFD-E11","HFD-E12","HFD-E13","HFD-E14","HFD-E15","HFD-KO-B16","HFD-KO-B17","HFD-KO-B18","HFD-KO-B19","HFD-KO-E16","HFD-KO-E17","HFD-KO-E18","HFD-KO-E19","HFD-KO-E20","HFD-KO-L16","HFD-KO-L17","HFD-KO-L18","HFD-KO-L19","HFD-KO-L20","HFD-L11","HFD-L12","HFD-L13","HFD-L14","HFD-L15")
 225 newdf <- select(tbl_fpkm2,"CD-L1","CD-L2","CD-L3","CD-L4","CD-L5","CD-KO-L6","CD-KO-L7","CD-KO-L8","CD-KO-L9","CD-KO-L10","HFD-L11","HFD-L12","HFD-L13","HFD-L14","HFD-L15","HFD-KO-L16","HFD-KO-L17","HFD-KO-L18","HFD-KO-L19","HFD-KO-L20","CD-B1","CD-B2","CD-B3","CD-B4","CD-B5","CD-KO-B6","CD-KO-B7","CD-KO-B8","CD-KO-B10","HFD-B11","HFD-B12","HFD-B13","HFD-B14","HFD-B15","HFD-KO-B16","HFD-KO-B17","HFD-KO-B18","HFD-KO-B19","CD-E1","CD-E2","CD-E4","CD-E5","CD-KO-E6","CD-KO-E7","CD-KO-E8","CD-KO-E9","CD-KO-E10","HFD-E11","HFD-E12","HFD-E13","HFD-E14","HFD-E15","HFD-KO-E16","HFD-KO-E17","HFD-KO-E18","HFD-KO-E19","HFD-KO-E20")
 226 pdf(file="5.Gene_abundance_distribution_insequnce.pdf",width=15,height=9)
 227 par(mai=c(1.8,1,1,1))
 228 tropical <- c('darkorange', 'dodgerblue', 'hotpink', 'limegreen', 'yellow')
 229 palette(tropical)
 230 newdf <- log2(newdf +1)
 231 boxplot(newdf, col=as.numeric(pheno_data$tko), las=2,ylab='log2(FPKM+1)')
 232 dev.off()
 233