modified: makesrc3.mk
[GalaxyCodeBases.git] / BioInfo / BS-Seq / bwa-meth / compare / src / sim-roc.py
blob0461faf647720927ee37801dd0075f764fe31ea1
1 import os.path as op
2 import re
3 import sys
4 from toolshed import reader, nopen
5 from collections import defaultdict
7 import numpy as np
8 from itertools import cycle
9 import pylab as pl
10 import seaborn
12 colors = cycle(seaborn.color_palette('Set1', 8))
14 BASES = False
16 def name(bam):
17 return op.basename(bam).rsplit(".", 1)[0].split("-")[0]
18 return op.basename(bam).rsplit(".", 1)[0].split("-")[0] + ("-trim" if "trim" in bam else "")
19 return op.dirname(bam)
21 def count_bases(cigar, patt=re.compile("\d+[IM]")):
22 return sum(int(p[:-1]) for p in patt.findall(cigar))
24 def count_on_off(bam, flags, pad, bases=BASES):
26 #if not "last" in bam: flags = flags + " -f 0x2"
28 on_count = [0] * 256
29 off_count = [0] * 256
30 rcounts = defaultdict(int)
32 posn = re.compile(".*?_(.+):(\d+)-(\d+)")
34 # chr1b:3001315:+__chr1b:3001467:- 99 chr1 3001316 60 100M
35 print >>sys.stderr, bam
36 for toks in reader("|samtools view {flags} {bam}".format(**locals()),
37 header=False):
38 if toks[0][0] == "@": continue
39 if toks[0].endswith(("_R1", "_R2")):
40 toks[0] = toks[0][:-3]
42 cigar = toks[5]
44 qual = int(toks[4])
45 if toks[0].endswith("_BAD"):
46 off_count[qual] += (count_bases(cigar) if bases else 1)
47 continue
49 rname, flag = toks[0], int(toks[1])
50 rcounts[rname] += 1
51 # @2_chr4:72040172-72040686_R1
52 if rcounts[rname] > 2:
53 raise Exception("%s:%s", (bam, rname))
55 try:
56 chrom, start, end = re.match(posn, rname).groups(0)
57 except:
58 print >>sys.stderr, rname
59 raise
61 pos = int(start if flag & 0x40 else end)
62 on = chrom == toks[2] and abs(pos - int(toks[3])) <= pad
63 if on:
64 on_count[qual] += (count_bases(cigar) if bases else 1)
65 else:
66 off_count[qual] += (count_bases(cigar) if bases else 1)
68 on_count = np.cumsum(on_count[::-1])[::-1]
69 off_count = np.cumsum(off_count[::-1])[::-1]
70 return off_count, on_count
72 FLAGS="-F%i" % (0x4 | 0x100 | 0x200)
73 def main(bams, reads=None, flags=FLAGS, pad=12002):
74 if not reads.isdigit():
75 reads = 2 * float(nopen("|bioawk -c fastx 'END { print NR }' %s" % reads).next())
76 else:
77 reads = 2.0 * int(reads)
79 if any('trim' in b for b in bams):
80 assert all('trim' in b for b in bams), [b for b in bams if not 'trim'
81 in b]
82 counts = {}
83 names, pts = [], []
85 denom = float(reads)
86 if BASES: denom *= 100.
87 for bam in bams:
88 counts[bam] = count_on_off(bam, flags, pad)
91 out = sys.stdout
92 print >>out, "qual\tmethod\toff\ton"
94 for qual in range(0, 256):
95 for b in bams:
96 print >>out, "{qual}\t{bam}\t{off}\t{on}".format(
97 qual=qual, bam=name(b),
98 off=counts[b][0][qual] / denom,
99 on=counts[b][1][qual] / denom)
100 print >>sys.stderr, "wrote", out.name
102 for bam in bams:
104 # multiply by 100 bases per read.
105 color = next(colors)
107 symbol = 'o' if len(set(counts[bam][0])) < 3 else '.'
108 nmax = max(i for i, v in enumerate(counts[bam][1]) if v > 0)
109 p, = pl.plot(counts[bam][0][1:nmax + 1] / denom,
110 counts[bam][1][1:nmax + 1] / denom,
111 color=color,
112 mec=color,
113 mfc=color,
114 marker=symbol, label=name(bam))
115 pts.append(p)
116 names.append(name(bam))
117 pl.xlabel('off target')
118 pl.ylabel('on target')
119 pl.legend(pts, names, loc='lower right')
120 pl.xlim(xmin=0)
121 pl.ylim(ymin=0)
122 print >>sys.stderr, reads
124 pl.show()
126 if __name__ == "__main__":
127 import argparse
128 p = argparse.ArgumentParser()
129 p.add_argument("--reads", help="reads file", required=True)
130 p.add_argument("bams", nargs="+")
132 a = p.parse_args()
134 main(a.bams, reads=a.reads)