modified: pixi.toml
[GalaxyCodeBases.git] / BioInfo / BS-Seq / bwa-meth / compare / src / target-roc.py
blob17f317173d8f61ebc3d04135b39305e53e343284
1 from __future__ import division
2 import os
3 import os.path as op
4 import sys
5 import tempfile
6 from toolshed import reader
8 import numpy as np
9 from itertools import cycle
10 import pylab as pl
12 def name(bam):
13 return op.basename(bam).rsplit(".", 1)[0].split("-")[0]
15 def counter(fname):
16 fname = fname[0] if not isinstance(fname, basestring) else fname
17 print >>sys.stderr, fname
18 qual_count = [0] * 256
19 for sam_line in (x.rstrip().split("\t") for x in nopen(fname) if not
20 x.startswith('@')):
21 qual = int(sam_line[4])
22 qual_count[qual] += 1
23 # each qual should be the sum of all quals with a lower qual than it
24 return np.cumsum(qual_count[::-1])[::-1]
26 from toolshed import nopen, pmap
28 def count_both(bam, regions, flags):
29 #if not "last" in bam: flags += " -f0x2"
30 off, on = OFF.format(**locals()), ON.format(**locals())
31 return bam, map(counter, [off, on])
33 def count_bam(bam, flags):
34 return int(next(nopen("|samtools view -c {flags} {bam}".format(**locals()))))
36 ON = "| samtools view {bam} -L {regions} {flags}"
37 OFF = "| bedtools intersect -ubam -abam {bam} -b {regions} -wa -v \
38 | samtools view - {flags}"
40 def main(regions, bams, reads=None, flags="-F%i" % (0x100 | 0x4 | 0x200 | 0x400),
41 pad=100):
42 r2 = open(tempfile.mktemp(), 'w')
43 for toks in reader(regions, header=False):
44 if toks[0][0] == "@" or not (toks[1] + toks[2]).isdigit(): continue
45 toks[1] = str(max(0, int(toks[1]) - pad))
46 toks[2] = str(int(toks[2]) + pad)
47 print >>r2, "\t".join(toks)
48 r2.flush()
49 regions = r2.name
50 print reads
51 if reads.isdigit():
52 reads = int(reads)
53 elif reads != "bam":
54 reads = int(nopen("|bioawk -c fastx 'END { print NR }' %s" % reads).next()) * 2.0
56 counts = {}
57 colors = cycle('rgbkmy')
58 bam_reads = {}
60 counts = dict(pmap(count_both, ((bam, regions, flags)
61 for bam in bams)))
63 for bam in bams:
64 nreads = count_bam(bam, flags) if reads == "bam" else reads
65 bam_reads[bam] = nreads
66 symbol = 'o' if len(set(counts[bam][0])) < 3 else '.'
67 pl.plot(counts[bam][0] / float(nreads), counts[bam][1] / float(nreads),
68 '%s%s' % (colors.next(), symbol), label=name(bam))
70 pl.xlabel('off target')
71 pl.ylabel('on target')
72 pl.legend(loc='lower right')
73 pl.xlim(xmin=0)
74 pl.ylim(ymin=0)
75 pl.show()
76 os.unlink(r2.name)
78 out = sys.stdout
79 print >>out, "qual\tmethod\toff\ton"
81 for qual in range(0, 256):
82 for b in bams:
83 print >>out, "{qual}\t{bam}\t{off}\t{on}".format(
84 qual=qual, bam=name(b),
85 off=counts[b][0][qual] / bam_reads[bam],
86 on=counts[b][1][qual] / bam_reads[bam])
87 print >>sys.stderr, "wrote", out.name
89 if __name__ == "__main__":
90 import argparse
91 p = argparse.ArgumentParser()
92 p.add_argument("--reads", help="reads file")
93 p.add_argument("regions")
94 p.add_argument("bams", nargs="+")
96 a = p.parse_args()
98 main(a.regions, a.bams, reads=a.reads)