5 from toolshed
import nopen
7 def faseq(fa
, chrom
, start
, end
, cache
=[None]):
9 this is called by pileup which is ordered by chrom
10 so we can speed things up by reading in a chrom at
13 if cache
[0] is None or cache
[0][0] != chrom
:
14 seq
= "".join(x
.strip() for i
, x
in
15 enumerate(nopen("|samtools faidx %s %s" % (fa
, chrom
))) if i
>
17 cache
[0] = (chrom
, seq
)
19 return seq
[start
- 1: end
]
21 def get_context(seq5
, forward
):
23 >>> get_context('GACGG', True)
28 assert seq5
[2] == "C", seq5
29 if seq5
[3] == "G": return "CG+"
30 if seq5
[4] == "G": return "CHG+"
32 else: # reverse complement
33 assert seq5
[2] == "G", seq5
34 if seq5
[1] == "C": return "CG-"
35 if seq5
[0] == "C": return "CHG-"
37 except IndexError: # ends of chrom
38 return "CHH" + ["-", "+"][int(forward
)]
40 def tabulate_main(args
):
42 tabulate methylation from bwa-meth.py call
44 p
= argparse
.ArgumentParser(__doc__
)
45 p
.add_argument("--reference", help="reference fasta", required
=True)
46 p
.add_argument("--region", help="specific region in which to run mpileup",
48 p
.add_argument("--g-only", default
=False, action
="store_true",
49 help="only use information from the reverse strand (G->A conversions)")
50 p
.add_argument("--map-q", "-q", type=int, default
=10, help="only tabulate "
51 "methylation for reads with at least this mapping quality")
52 p
.add_argument("--base-q", "-Q", type=int, default
=10, help="only tabulate "
53 "methylation for bases with at least this quality")
54 p
.add_argument("--skip-left", type=int, default
=3, help="don't use the "
55 "first N bases from the start of the read to avoid bias")
56 p
.add_argument("--skip-right", type=int, default
=3, help="don't use the "
57 "last N bases from the end of the read to avoid bias")
58 p
.add_argument("--read-length", type=int, help="length of reads"
59 " before trimming for used with skipping", required
=True)
60 p
.add_argument("--rf", action
='append', default
=[0x2], type=int)
61 p
.add_argument("--ff", action
='append', default
=[0x200, 0x400, 0x800],
63 p
.add_argument("bams", nargs
="+")
65 a
= p
.parse_args(args
)
66 assert os
.path
.exists(a
.reference
)
68 a
.region
= ("-l " if op
.exists(a
.region
) else "-r ") + a
.region
70 for flag
in set(a
.ff
): ff |
= flag
72 for flag
in set(a
.rf
): rf |
= flag
73 cmd
= ("|samtools mpileup --rf {rf} --ff {ff} -Of {reference} {region}"
74 " -d100000 -BQ {base_q} -q {map_q} {bams}")
76 cmd
= cmd
.format(reference
=a
.reference
, map_q
=a
.map_q
, base_q
=a
.base_q
,
77 bams
=" ".join(a
.bams
), region
=a
.region
, rf
=rf
, ff
=ff
)
78 sys
.stderr
.write("generating pileup with command: %s\n" % cmd
)
79 samples
= [op
.basename(b
)[:-4] for b
in a
.bams
]
80 tabulate_methylation(cmd
, a
.reference
, samples
, a
.g_only
, a
.skip_left
,
81 a
.read_length
- a
.skip_right
)
83 def tabulate_methylation(fpileup
, reference
, samples
, g_only
=False,
84 skip_left
=1, skip_right
=99):
86 conversion
= {"C": "T", "G": "A"}
88 for sample
in samples
:
89 fhs
.append(open('%s.methylation.txt' % sample
, 'w'))
90 fhs
[-1].write("#chrom\tpos0\tpos1\tpct\tn_same\tn_converted\n")
93 # regexp to remove the ^, $ stuff
94 end_re
= re
.compile("\^.|\$")
95 deletion_re
= re
.compile("(-\d+[ACTGNatcgn]+)")
97 insertion_re
= re
.compile("\+(\d+)([ACTGNatcgn]+)")
98 deletion_re
= re
.compile("-(\d+)([ACTGNatcgn]+)")
101 """ with a base string like "-1ACTG" just return CTG
102 for -3CCCTG return TG
104 n
= int(m
.groups()[0])
105 bases
= m
.groups()[1]
108 for toks
in (l
.rstrip("\r\n").split("\t") for l
in nopen(fpileup
)):
109 chrom
, pos1
, ref
= toks
[:3]
113 if g_only
and ref
!= "G": continue
114 converted
= conversion
.get(ref
)
115 if converted
is None:
117 s
= faseq(reference
, chrom
, pos1
- 2, pos1
+ 2)
118 ctx
= get_context(s
, ref
== "C")
119 if not ctx
.startswith("CG"): continue
120 for i
, sample
in enumerate(samples
):
121 coverage
, bases
, quals
, posns
= toks
[3 + (i
* 4): 7 + (i
* 4)]
122 if coverage
== '0': continue
126 bases
= end_re
.sub("", bases
) # remove $ and ^.
128 bases
= deletion_re
.sub(del_sub
, bases
)
130 bases
= insertion_re
.sub(del_sub
, bases
)
132 keep
= [skip_left
< int(p
) < skip_right
for p
in posns
.split(",")]
133 if not any(keep
): continue
134 bases
= "".join(b
for j
, b
in enumerate(bases
) if keep
[j
])
135 if bases
== "": continue
137 # . == same on + strand, , == same on - strand
138 n_same_plus
= bases
.count(".")
139 n_same_minus
= bases
.count(",")
140 n_same
= n_same_plus
+ n_same_minus
142 #n_converted_plus = b.count(converted)
143 #n_converted_minus = b.count(converted_lower())
144 #n_converted = n_converted_plus + n_converted_minus
145 n_converted
= bases
.upper().count(converted
)
146 if n_same
+ n_converted
== 0: continue
148 n_other
= sum(1 for b
in bases
.lower() if b
in "actg" and b
!=
150 # weak filtering here for a likely snp.
151 if n_other
> n_converted
: continue
153 pct
= 100. * n_same
/ float(n_same
+ n_converted
)
154 #fhs[i].write("{chrom}\t{pos0}\t{pos1}\t{pct:.1f}\t{n_same}\t{n_converted}\t{ctx}\n".format(**locals()))
155 fhs
[i
].write("{chrom}\t{pos0}\t{pos1}\t{pct:.1f}\t{n_same}\t{n_converted}\n".format(**locals()))
157 if __name__
== "__main__":
158 tabulate_main(sys
.argv
[1:])