new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / justonce / sperm / kmerfreq.r
bloba4e1a7e95cc59a70bc5465cc0fb02a33e81a2e13
1 #!/usr/bin/env r
2 if (is.null(argv) | length(argv)<3) {
3 cat("Usage: kmerfreq.r maxcnt hist_file coverage [scaledmin] [scaledmax]\n")
4 q()
6 testXrange <- as.numeric(argv[1])
7 infile <- argv[2]
8 avgcvg <- as.numeric(argv[3])
9 scaledmin <- as.numeric(argv[4])
10 scaledmax <- as.numeric(argv[5])
12 if(!exists("D__myfit"))
13 source("myfit.R")
15 #inid <- unlist(strsplit(infile,'.',TRUE))[1]
16 inid <- strsplit(infile,'.',TRUE)[[1]][1]
17 TheIDRelshp <- c('Donor','MALBAC Sperm01','MALBAC Sperm02','MALBAC Sperm03','MDA Sperm23','MDA Sperm24','MDA Sperm28','Donor')
18 names(TheIDRelshp) <- c('blood','S1','S2','S3','S23','S24','S28','mlbacDonor')
20 maxXrange <- 25
22 testXrange <- 15
23 infile <- 'blood.fq.k25.gz.hist.fixed'
24 avgcvg <- 3.27885
26 library('vcd')
28 aa=read.table(infile,skip=8)
29 thefreq <- aa[,1]
30 cnt <- as.numeric(aa[,2])
31 theratio <- aa[,3]
33 thedata <- cbind(thefreq,cnt)
34 flag <- thedata[,1] <= testXrange
35 #usedata <- cbind(cnt[flag],thefreq[flag])
36 usedata <- cbind(theratio[flag],thefreq[flag])
38 scaled <- scale2pois(usedata,avgcvg,cntmin=scaledmin,cntmax=scaledmax)
39 scaledata <- scaled$ret
40 scaleres <- myfit(scaledata, par = list(lambda = avgcvg))
41 print('===scaledmyfit===')
42 #print(scaleres)
43 scalesum <- summary(scaleres)
45 scalechi <- chisq.test(cbind(scaleres$observed,scaleres$fitted))
46 print(scalechi)
48 fitres <- goodfit(usedata,type='poisson', par = list(lambda = avgcvg))
49 #fitres <- goodfit(usedata,type='poisson', method = 'MinChisq')
50 #sumres <- summary(fitres)
52 selfres <- myfit(usedata, par = list(lambda = avgcvg))
53 print('===myfit===')
54 #print(selfres)
55 selfsum <- summary(selfres)
56 #print(selfsum)
58 newres<-list(observed = fitres$observed[c(-1,-2)], count = fitres$count[c(-1,-2)], fitted = fitres$fitted[c(-1,-2)],
59 type = "poisson", method = fitres$method, df = fitres$df - 2, par = fitres$par)
60 class(newres) <- "goodfit"
61 print('===goodfit===')
62 #print(newres)
63 sumres <- summary(newres)
64 #print(sumres)
66 dbg1 <- function(x) {
67 arg <- deparse(substitute(x))
68 RVAL <- cbind(x$observed,x$fitted,(x$observed-x$fitted)/x$fitted)
69 #RVAL <- round(RVAL,digits = 6)
70 colnames(RVAL) <- c(paste0(arg,'.ob'),paste0(arg,'.fi'),'(O-E)/E')
71 rownames(RVAL) <- 1+ 1:nrow(RVAL)
72 RVAL
74 print(cbind(dbg1(scaleres),dbg1(newres),dbg1(selfres)))
76 tiff(filename = paste0(infile,".tiff"), compression="lzw", width = 683, height = 683)#, units = "px", pointsize = 52)
77 #par(mar=c(5, 6, 2, 2),ps=20,family='sans')
78 par(mar=c(4.5, 6, 3, 2),ps=26,family='sans')
80 yy <- theratio
81 xx <- thefreq
83 mean2=avgcvg
84 xxx=seq(0,max(xx))
85 zz=dpois(xxx,mean2);
86 maxY=max(zz,yy,zz*selfres$zoom)
87 maxY<-0.32
88 #lines(xx,zz,xlim=c(0,60),type='l');
90 #barplot(yy,xlim=c(0,20),ylim=c(0,0.212),xlab="K-mer count",ylab='Ratio',col='navy',cex.lab=1)
91 plot(xx,yy,type='h',lwd=19,xlim=c(1,maxXrange),ylim=c(0,maxY),
92 main = paste0("Pearson goodness of fit, p=",signif(scalesum[1,3],digits = 6)),
93 xlab="K-mer count",ylab='Ratio',col='navy',cex.lab=1)
94 # dpois(3,3.62)=0.2117524
95 legend("topright",pch=-1,lty=1,col=c('navy','green3','red'), x.intersp = 1, y.intersp = 2,cex=1,lwd=c(11,7,8),
96 legend= c( TheIDRelshp[inid],paste0("lambda=",signif(avgcvg,digits=6)),
97 paste0("scaledby=",signif(scaled$zoom, digits=6)) )
99 lines(scaleres$count,scaleres$observed,xlim=c(0,60),type='h',col=rgb(1,1,0,1),lwd=11)
101 axis(1, at = seq(0, maxXrange, by = 5),lwd=3,cex=1)
102 lines(xxx,zz,xlim=c(0,maxXrange),type='l',col=rgb(0,0.804,0,0.8),lwd=7)
103 #lines(xx,yy*scaled$zoom,xlim=c(0,60),type='l',col=rgb(1,0,0,0.7),lwd=3)
104 #lines(scaleres$count,scaleres$observed,xlim=c(0,60),type='l',col=rgb(1,0,0,0.9),lwd=5)
106 lines(xx,yy*scaled$zoom,xlim=c(0,60),type='h',col=rgb(1,0,0,1),lwd=7)
107 dev.off()
109 print(selfsum)
110 outline <- paste0(TheIDRelshp[inid],':[',scaled$cntmin,',',scaled$cntmax,"] cvg=",avgcvg," scaleR=",scaled$zoom)
111 print(outline)
112 write(outline,paste0(infile,".txt"),append=FALSE)
114 outline <- paste0('X^2= ',scalesum[1,1],', df= ',scalesum[1,2],', p= ',scalesum[1,3])
115 print(outline)
116 write(outline,paste0(infile,".txt"),append=TRUE)
118 print(scalesum)
121 #./kmerfreq.r mlbacDonor.k25.lz4.hist 6.85
122 #./kmerfreq.r mdaS23.k25.lz4.hist 4.34
123 #./kmerfreq.r S01.k25.lz4.hist 3.62
125 # X^2 df P(> X^2)
126 #Pearson 1.56832025 18 0.9999998
127 #Likelihood Ratio -0.06964308 18 1.0000000
129 # X^2 df P(> X^2)
130 #Pearson 7285.4140548 18 0
131 #Likelihood Ratio 0.3987449 18 1
133 # X^2 df P(> X^2)
134 #Pearson 2.511278e+05 18 0.0000000
135 #Likelihood Ratio 2.540676e+00 18 0.9999924
137 ./kmerfreq.r 15 mlbacDonor.k25.lz4.hist.fixed 5.67811
138 ./kmerfreq.r 15 S1.fq.k25.gz.hist.fixed 3.29571
139 ./kmerfreq.r 15 S2.fq.k25.gz.hist.fixed 3.28174
140 ./kmerfreq.r 15 S3.fq.k25.gz.hist.fixed 2.94459
141 ./kmerfreq.r 15 S23.fq.k25.gz.hist.fixed 3.26044
142 ./kmerfreq.r 15 S24.fq.k25.gz.hist.fixed 3.27885
143 ./kmerfreq.r 15 S28.fq.k25.gz.hist.fixed 3.30884
144 ./kmerfreq.r 15 blood.fq.k25.gz.hist.fixed 3.27885