modified: myjupyterlab.sh
[GalaxyCodeBases.git] / BioInfo / BS-Seq / bwa-meth / bam-merge-pairs.py
blobef6967df961aeb48003e1f632d48301cbf7c5021
1 """
2 In order to avoid double counting of bases in overlapping paired end reads,
3 this script accept mapped reads in SAM format, use the position to determine
4 overlap, and set the lower base-quality of pair overlap to 0.
5 It is assumed that the reads are name-sorted as when they come from the
6 aligner.
8 Usage:
10 bwa mem reference.fa R1.fq R2.fq \
11 | bam-merge-pairs.py \
12 | samtools view -bS - > merged.bam
13 """
14 from __future__ import print_function
15 from __future__ import division
16 import itertools as it
17 from operator import itemgetter
18 from bwameth import Bam
19 import sys
21 line_iter = (x.rstrip("\r\n").split("\t") for x in sys.stdin)
24 for g, pair in it.groupby(line_iter, itemgetter(0)):
25 if g.startswith("@"):
26 print("\n".join("\t".join(line) for line in pair))
27 continue
28 pair = list(pair)
29 if len(pair) == 1:
30 print "\t".join(pair[0])
31 continue
33 assert len(pair) == 2, pair
34 left, right = [Bam(b) for b in pair]
36 # TODO: use left_shift(), right_shift()
37 if left.pos + left.tlen < right.pos:
38 print(str(left))
39 print(str(right))
40 continue
42 # there is overlap
43 # L ------------------>
44 # R <------------------
45 ovl_bases = right.pos, left.pos + left.tlen