modified: myjupyterlab.sh
[GalaxyCodeBases.git] / BioInfo / BS-Seq / bwa-meth / scripts / tabulate-methylation.py
blob4e6c912cafdcdbdf597e180aec7b81a44073e2c5
1 import sys
2 import os
3 import os.path as op
4 import argparse
5 from toolshed import nopen
7 def faseq(fa, chrom, start, end, cache=[None]):
8 """
9 this is called by pileup which is ordered by chrom
10 so we can speed things up by reading in a chrom at
11 a time into memory
12 """
13 if cache[0] is None or cache[0][0] != chrom:
14 seq = "".join(x.strip() for i, x in
15 enumerate(nopen("|samtools faidx %s %s" % (fa, chrom))) if i >
16 0).upper()
17 cache[0] = (chrom, seq)
18 chrom, seq = cache[0]
19 return seq[start - 1: end]
21 def get_context(seq5, forward):
22 """
23 >>> get_context('GACGG', True)
24 'CG+'
25 """
26 try:
27 if forward:
28 assert seq5[2] == "C", seq5
29 if seq5[3] == "G": return "CG+"
30 if seq5[4] == "G": return "CHG+"
31 return "CHH+"
32 else: # reverse complement
33 assert seq5[2] == "G", seq5
34 if seq5[1] == "C": return "CG-"
35 if seq5[0] == "C": return "CHG-"
36 return "CHH-"
37 except IndexError: # ends of chrom
38 return "CHH" + ["-", "+"][int(forward)]
40 def tabulate_main(args):
41 __doc__ = """
42 tabulate methylation from bwa-meth.py call
43 """
44 p = argparse.ArgumentParser(__doc__)
45 p.add_argument("--reference", help="reference fasta", required=True)
46 p.add_argument("--region", help="specific region in which to run mpileup",
47 default="")
48 p.add_argument("--g-only", default=False, action="store_true",
49 help="only use information from the reverse strand (G->A conversions)")
50 p.add_argument("--map-q", "-q", type=int, default=10, help="only tabulate "
51 "methylation for reads with at least this mapping quality")
52 p.add_argument("--base-q", "-Q", type=int, default=10, help="only tabulate "
53 "methylation for bases with at least this quality")
54 p.add_argument("--skip-left", type=int, default=3, help="don't use the "
55 "first N bases from the start of the read to avoid bias")
56 p.add_argument("--skip-right", type=int, default=3, help="don't use the "
57 "last N bases from the end of the read to avoid bias")
58 p.add_argument("--read-length", type=int, help="length of reads"
59 " before trimming for used with skipping", required=True)
60 p.add_argument("--rf", action='append', default=[0x2], type=int)
61 p.add_argument("--ff", action='append', default=[0x200, 0x400, 0x800],
62 type=int)
63 p.add_argument("bams", nargs="+")
65 a = p.parse_args(args)
66 assert os.path.exists(a.reference)
67 if a.region:
68 a.region = ("-l " if op.exists(a.region) else "-r ") + a.region
69 ff = 0
70 for flag in set(a.ff): ff |= flag
71 rf = 0
72 for flag in set(a.rf): rf |= flag
73 cmd = ("|samtools mpileup --rf {rf} --ff {ff} -Of {reference} {region}"
74 " -d100000 -BQ {base_q} -q {map_q} {bams}")
76 cmd = cmd.format(reference=a.reference, map_q=a.map_q, base_q=a.base_q,
77 bams=" ".join(a.bams), region=a.region, rf=rf, ff=ff)
78 sys.stderr.write("generating pileup with command: %s\n" % cmd)
79 samples = [op.basename(b)[:-4] for b in a.bams]
80 tabulate_methylation(cmd, a.reference, samples, a.g_only, a.skip_left,
81 a.read_length - a.skip_right)
83 def tabulate_methylation(fpileup, reference, samples, g_only=False,
84 skip_left=1, skip_right=99):
86 conversion = {"C": "T", "G": "A"}
87 fhs = []
88 for sample in samples:
89 fhs.append(open('%s.methylation.txt' % sample, 'w'))
90 fhs[-1].write("#chrom\tpos0\tpos1\tpct\tn_same\tn_converted\n")
92 import re
93 # regexp to remove the ^, $ stuff
94 end_re = re.compile("\^.|\$")
95 deletion_re = re.compile("(-\d+[ACTGNatcgn]+)")
97 insertion_re = re.compile("\+(\d+)([ACTGNatcgn]+)")
98 deletion_re = re.compile("-(\d+)([ACTGNatcgn]+)")
100 def del_sub(m):
101 """ with a base string like "-1ACTG" just return CTG
102 for -3CCCTG return TG
104 n = int(m.groups()[0])
105 bases = m.groups()[1]
106 return bases[n:]
108 for toks in (l.rstrip("\r\n").split("\t") for l in nopen(fpileup)):
109 chrom, pos1, ref = toks[:3]
110 pos1 = int(pos1)
111 pos0 = pos1 - 1
112 ref = ref.upper()
113 if g_only and ref != "G": continue
114 converted = conversion.get(ref)
115 if converted is None:
116 continue
117 s = faseq(reference, chrom, pos1 - 2, pos1 + 2)
118 ctx = get_context(s, ref == "C")
119 if not ctx.startswith("CG"): continue
120 for i, sample in enumerate(samples):
121 coverage, bases, quals, posns = toks[3 + (i * 4): 7 + (i * 4)]
122 if coverage == '0': continue
123 obases = bases[:]
124 oquals = quals[:]
126 bases = end_re.sub("", bases) # remove $ and ^.
127 ebases = bases
128 bases = deletion_re.sub(del_sub, bases)
129 dbases = bases
130 bases = insertion_re.sub(del_sub, bases)
132 keep = [skip_left < int(p) < skip_right for p in posns.split(",")]
133 if not any(keep): continue
134 bases = "".join(b for j, b in enumerate(bases) if keep[j])
135 if bases == "": continue
137 # . == same on + strand, , == same on - strand
138 n_same_plus = bases.count(".")
139 n_same_minus = bases.count(",")
140 n_same = n_same_plus + n_same_minus
142 #n_converted_plus = b.count(converted)
143 #n_converted_minus = b.count(converted_lower())
144 #n_converted = n_converted_plus + n_converted_minus
145 n_converted = bases.upper().count(converted)
146 if n_same + n_converted == 0: continue
147 # SNP
148 n_other = sum(1 for b in bases.lower() if b in "actg" and b !=
149 converted.lower())
150 # weak filtering here for a likely snp.
151 if n_other > n_converted: continue
153 pct = 100. * n_same / float(n_same + n_converted)
154 #fhs[i].write("{chrom}\t{pos0}\t{pos1}\t{pct:.1f}\t{n_same}\t{n_converted}\t{ctx}\n".format(**locals()))
155 fhs[i].write("{chrom}\t{pos0}\t{pos1}\t{pct:.1f}\t{n_same}\t{n_converted}\n".format(**locals()))
157 if __name__ == "__main__":
158 tabulate_main(sys.argv[1:])