BioInfo/BS-Seq/bwa-meth/bwameth.py

   1 #!/usr/bin/env python
   2 """
   3 map bisulfite converted reads to an insilico converted genome using bwa mem.
   4 A command to this program like:
   5
   6     python bwameth.py --reference ref.fa A.fq B.fq
   7
   8 Gets converted to:
   9
  10     bwa mem -pCMR ref.fa.bwameth.c2t '<python bwameth.py c2t A.fq B.fq'
  11
  12 So that A.fq has C's converted to T's and B.fq has G's converted to A's
  13 and both are streamed directly to the aligner without a temporary file.
  14 The output is a corrected, sorted, indexed BAM.
  15 """
  16 from __future__ import print_function
  17 import tempfile
  18 import sys
  19 import os
  20 import os.path as op
  21 import argparse
  22 from subprocess import check_call
  23 from operator import itemgetter
  24 from itertools import groupby, repeat, chain
  25 import re
  26
  27 try:
  28     from itertools import izip
  29     import string
  30     maketrans = string.maketrans
  31 except ImportError: # python3
  32     izip = zip
  33     maketrans = str.maketrans
  34 from toolshed import nopen, reader, is_newer_b
  35
  36 __version__ = "0.2.0"
  37
  38 def checkX(cmd):
  39     for p in os.environ['PATH'].split(":"):
  40         if os.access(os.path.join(p, cmd), os.X_OK):
  41             break
  42     else:
  43         raise Exception("executable for '%s' not found" % cmd)
  44
  45 checkX('samtools')
  46 checkX('bwa')
  47
  48 class BWAMethException(Exception): pass
  49
  50 def comp(s, _comp=maketrans('ATCG', 'TAGC')):
  51     return s.translate(_comp)
  52
  53 def wrap(text, width=100): # much faster than textwrap
  54     try: xrange
  55     except NameError: xrange = range
  56     for s in xrange(0, len(text), width):
  57         yield text[s:s+width]
  58
  59 def run(cmd):
  60     list(nopen("|%s" % cmd.lstrip("|")))
  61
  62 def fasta_iter(fasta_name):
  63     fh = nopen(fasta_name)
  64     faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
  65     for header in faiter:
  66         header = next(header)[1:].strip()
  67         yield header, "".join(s.strip() for s in next(faiter)).upper()
  68
  69 def convert_reads(fq1s, fq2s, out=sys.stdout):
  70
  71     for fq1, fq2 in zip(fq1s.split(","), fq2s.split(",")):
  72         sys.stderr.write("converting reads in %s,%s\n" % (fq1, fq2))
  73         fq1 = nopen(fq1)
  74         if fq2 != "NA":
  75             fq2 = nopen(fq2)
  76             q2_iter = izip(*[fq2] * 4)
  77         else:
  78             sys.stderr.write("WARNING: running bwameth in single-end mode\n")
  79             q2_iter = repeat((None, None, None, None))
  80         q1_iter = izip(*[fq1] * 4)
  81
  82         lt80 = 0
  83         for pair in izip(q1_iter, q2_iter):
  84             for read_i, (name, seq, _, qual) in enumerate(pair):
  85                 if name is None: continue
  86                 name = name.rstrip("\r\n").split(" ")[0]
  87                 if name[0] != "@":
  88                     sys.stderr.write("""ERROR!!!!
  89 ERROR!!! FASTQ conversion failed
  90 ERROR!!! expecting FASTQ 4-tuples, but found a record %s that doesn't start with "@"
  91 """ % name)
  92                     sys.exit(1)
  93                 if name.endswith(("_R1", "_R2")):
  94                     name = name[:-3]
  95                 elif name.endswith(("/1", "/2")):
  96                     name = name[:-2]
  97
  98                 seq = seq.upper().rstrip('\n')
  99                 if len(seq) < 80:
 100                     lt80 += 1
 101
 102                 char_a, char_b = ['CT', 'GA'][read_i]
 103                 # keep original sequence as name.
 104                 name = " ".join((name,
 105                                 "YS:Z:" + seq +
 106                                 "\tYC:Z:" + char_a + char_b + '\n'))
 107                 seq = seq.replace(char_a, char_b)
 108                 out.write("".join((name, seq, "\n+\n", qual)))
 109
 110     out.flush()
 111     if lt80 > 50:
 112         sys.stderr.write("WARNING: %i reads with length < 80\n" % lt80)
 113         sys.stderr.write("       : this program is designed for long reads\n")
 114     return 0
 115
 116 def convert_fasta(ref_fasta, just_name=False):
 117     out_fa = ref_fasta + ".bwameth.c2t"
 118     if just_name:
 119         return out_fa
 120     msg = "c2t in %s to %s" % (ref_fasta, out_fa)
 121     if is_newer_b(ref_fasta, out_fa):
 122         sys.stderr.write("already converted: %s\n" % msg)
 123         return out_fa
 124     sys.stderr.write("converting %s\n" % msg)
 125     try:
 126         fh = open(out_fa, "w")
 127         for header, seq in fasta_iter(ref_fasta):
 128             ########### Reverse ######################
 129             fh.write(">r%s\n" % header)
 130
 131             #if non_cpg_only:
 132             #    for ctx in "TAG": # use "ATC" for fwd
 133             #        seq = seq.replace('G' + ctx, "A" + ctx)
 134             #    for line in wrap(seq):
 135             #        print >>fh, line
 136             #else:
 137             for line in wrap(seq.replace("G", "A")):
 138                 fh.write(line + '\n')
 139
 140             ########### Forward ######################
 141             fh.write(">f%s\n" % header)
 142             for line in wrap(seq.replace("C", "T")):
 143                 fh.write(line + '\n')
 144         fh.close()
 145     except:
 146         try:
 147             fh.close()
 148         except UnboundLocalError:
 149             pass
 150         os.unlink(out_fa)
 151         raise
 152     return out_fa
 153
 154
 155 def bwa_index(fa):
 156     if is_newer_b(fa, (fa + '.amb', fa + '.sa')):
 157         return
 158     sys.stderr.write("indexing: %s\n" % fa)
 159     try:
 160         run("bwa index -a bwtsw %s" % fa)
 161     except:
 162         if op.exists(fa + ".amb"):
 163             os.unlink(fa + ".amb")
 164         raise
 165
 166 class Bam(object):
 167     __slots__ = 'read flag chrom pos mapq cigar chrom_mate pos_mate tlen \
 168             seq qual other'.split()
 169     def __init__(self, args):
 170         for a, v in zip(self.__slots__[:11], args):
 171             setattr(self, a, v)
 172         self.other = args[11:]
 173         self.flag = int(self.flag)
 174         self.pos = int(self.pos)
 175         self.tlen = int(float(self.tlen))
 176
 177     def __repr__(self):
 178         return "Bam({chr}:{start}:{read}".format(chr=self.chrom,
 179                                                  start=self.pos,
 180                                                  read=self.read)
 181
 182     def __str__(self):
 183         return "\t".join(str(getattr(self, s)) for s in self.__slots__[:11]) \
 184                          + "\t" + "\t".join(self.other)
 185
 186     def is_first_read(self):
 187         return bool(self.flag & 0x40)
 188
 189     def is_second_read(self):
 190         return bool(self.flag & 0x80)
 191
 192     def is_plus_read(self):
 193         return not (self.flag & 0x10)
 194
 195     def is_minus_read(self):
 196         return bool(self.flag & 0x10)
 197
 198     def is_mapped(self):
 199         return not (self.flag & 0x4)
 200
 201     def cigs(self):
 202         if self.cigar == "*":
 203             yield (0, None)
 204             raise StopIteration
 205         cig_iter = groupby(self.cigar, lambda c: c.isdigit())
 206         for g, n in cig_iter:
 207             yield int("".join(n)), "".join(next(cig_iter)[1])
 208
 209     def cig_len(self):
 210         return sum(c[0] for c in self.cigs() if c[1] in
 211                    ("M", "D", "N", "EQ", "X", "P"))
 212
 213     def left_shift(self):
 214         left = 0
 215         for n, cig in self.cigs():
 216             if cig == "M": break
 217             if cig == "H":
 218                 left += n
 219         return left
 220
 221     def right_shift(self):
 222         right = 0
 223         for n, cig in reversed(list(self.cigs())):
 224             if cig == "M": break
 225             if cig == "H":
 226                 right += n
 227         return -right or None
 228
 229     @property
 230     def original_seq(self):
 231         try:
 232             return next(x for x in self.other if x.startswith("YS:Z:"))[5:]
 233         except:
 234             sys.stderr.write(repr(self.other) + "\n")
 235             sys.stderr.write(self.read + "\n")
 236             raise
 237
 238     @property
 239     def ga_ct(self):
 240         return [x for x in self.other if x.startswith("YC:Z:")]
 241
 242     def longest_match(self, patt=re.compile("\d+M")):
 243         return max(int(x[:-1]) for x in patt.findall(self.cigar))
 244
 245
 246 def rname(fq1, fq2=""):
 247     fq1, fq2 = fq1.split(",")[0], fq2.split(",")[0]
 248     def name(f):
 249         n = op.basename(op.splitext(f)[0])
 250         if n.endswith('.fastq'): n = n[:-6]
 251         if n.endswith(('.fq', '.r1', '.r2')): n = n[:-3]
 252         return n
 253     return "".join(a for a, b in zip(name(fq1), name(fq2)) if a == b) or 'bm'
 254
 255
 256 def bwa_mem(fa, mfq, extra_args, threads=1, rg=None,
 257             paired=True, set_as_failed=None):
 258     conv_fa = convert_fasta(fa, just_name=True)
 259     if not is_newer_b(conv_fa, (conv_fa + '.amb', conv_fa + '.sa')):
 260         raise BWAMethException("first run bwameth.py index %s" % fa)
 261
 262     if not rg is None and not rg.startswith('@RG'):
 263         rg = '@RG\tID:{rg}\tSM:{rg}'.format(rg=rg)
 264
 265     # penalize clipping and unpaired. lower penalty on mismatches (-B)
 266     cmd = "|bwa mem -T 40 -B 2 -L 10 -CM "
 267
 268     if paired:
 269         cmd += ("-U 100 -p ")
 270     cmd += "-R '{rg}' -t {threads} {extra_args} {conv_fa} {mfq}"
 271     cmd = cmd.format(**locals())
 272     sys.stderr.write("running: %s\n" % cmd.lstrip("|"))
 273     as_bam(cmd, fa, set_as_failed)
 274
 275
 276 def as_bam(pfile, fa, set_as_failed=None):
 277     """
 278     pfile: either a file or a |process to generate sam output
 279     fa: the reference fasta
 280     set_as_failed: None, 'f', or 'r'. If 'f'. Reads mapping to that strand
 281                       are given the sam flag of a failed QC alignment (0x200).
 282     """
 283     sam_iter = nopen(pfile)
 284
 285     for line in sam_iter:
 286         if not line[0] == "@": break
 287         handle_header(line)
 288     else:
 289         sys.stderr.flush()
 290         raise Exception("bad or empty fastqs")
 291     sam_iter2 = (x.rstrip().split("\t") for x in chain([line], sam_iter))
 292     for read_name, pair_list in groupby(sam_iter2, itemgetter(0)):
 293         pair_list = [Bam(toks) for toks in pair_list]
 294
 295         for aln in handle_reads(pair_list, set_as_failed):
 296             sys.stdout.write(str(aln) + '\n')
 297
 298 def handle_header(line, out=sys.stdout):
 299     toks = line.rstrip().split("\t")
 300     if toks[0].startswith("@SQ"):
 301         sq, sn, ln = toks  # @SQ    SN:fchr11    LN:122082543
 302         # we have f and r, only print out f
 303         chrom = sn.split(":")[1]
 304         if chrom.startswith('r'): return
 305         chrom = chrom[1:]
 306         toks = ["%s\tSN:%s\t%s" % (sq, chrom, ln)]
 307     if toks[0].startswith("@PG"):
 308         #out.write("\t".join(toks) + "\n")
 309         toks = ["@PG\tID:bwa-meth\tPN:bwa-meth\tVN:%s\tCL:\"%s\"" % (
 310                          __version__,
 311                          " ".join(x.replace("\t", "\\t") for x in sys.argv))]
 312     out.write("\t".join(toks) + "\n")
 313
 314
 315 def handle_reads(alns, set_as_failed):
 316
 317     for aln in alns:
 318         orig_seq = aln.original_seq
 319         assert len(aln.seq) == len(aln.qual), aln.read
 320         # don't need this any more.
 321         aln.other = [x for x in aln.other if not x.startswith('YS:Z')]
 322
 323         # first letter of chrom is 'f' or 'r'
 324         direction = aln.chrom[0]
 325         aln.chrom = aln.chrom.lstrip('fr')
 326
 327         if not aln.is_mapped():
 328             aln.seq = orig_seq
 329             continue
 330
 331         assert direction in 'fr', (direction, aln)
 332         aln.other.append('YD:Z:' + direction)
 333
 334         if set_as_failed == direction:
 335             aln.flag |= 0x200
 336
 337         # here we have a heuristic that if the longest match is not 44% of the
 338         # sequence length, we mark it as failed QC and un-pair it. At the end
 339         # of the loop we set all members of this pair to be unmapped
 340         if aln.longest_match() < (len(orig_seq) * 0.44):
 341             aln.flag |= 0x200  # fail qc
 342             aln.flag &= (~0x2) # un-pair
 343             aln.mapq = min(int(aln.mapq), 1)
 344
 345         mate_direction = aln.chrom_mate[0]
 346         if mate_direction not in "*=":
 347             aln.chrom_mate = aln.chrom_mate[1:]
 348
 349         # adjust the original seq to the cigar
 350         l, r = aln.left_shift(), aln.right_shift()
 351         if aln.is_plus_read():
 352             aln.seq = orig_seq[l:r]
 353         else:
 354             aln.seq = comp(orig_seq[::-1][l:r])
 355
 356     if any(aln.flag & 0x200 for aln in alns):
 357         for aln in alns:
 358             aln.flag |= 0x200
 359             aln.flag &= (~0x2)
 360     return alns
 361
 362 def cnvs_main(args):
 363     __doc__ = """
 364     calculate CNVs from BS-Seq bams or vcfs
 365     """
 366     p = argparse.ArgumentParser(__doc__)
 367     p.add_argument("--regions", help="optional target regions", default='NA')
 368     p.add_argument("bams", nargs="+")
 369
 370     a = p.parse_args(args)
 371     r_script = """
 372 options(stringsAsFactors=FALSE)
 373 suppressPackageStartupMessages(library(cn.mops))
 374 suppressPackageStartupMessages(library(snow))
 375 args = commandArgs(TRUE)
 376 regions = args[1]
 377 bams = args[2:length(args)]
 378 n = length(bams)
 379 if(is.na(regions)){
 380     bam_counts = getReadCountsFromBAM(bams, parallel=min(n, 4), mode="paired")
 381     res = cn.mops(bam_counts, parallel=min(n, 4), priorImpact=20)
 382 } else {
 383     segments = read.delim(regions, header=FALSE)
 384     gr = GRanges(segments[,1], IRanges(segments[,2], segments[,3]))
 385     bam_counts = getSegmentReadCountsFromBAM(bams, GR=gr, mode="paired", parallel=min(n, 4))
 386     res = exomecn.mops(bam_counts, parallel=min(n, 4), priorImpact=20)
 387 }
 388 res = calcIntegerCopyNumbers(res)
 389
 390 df = as.data.frame(cnvs(res))
 391 write.table(df, row.names=FALSE, quote=FALSE, sep="\t")
 392 """
 393     with tempfile.NamedTemporaryFile(delete=True) as rfh:
 394         rfh.write(r_script + '\n')
 395         rfh.flush()
 396         for d in reader('|Rscript {rs_name} {regions} {bams}'.format(
 397             rs_name=rfh.name, regions=a.regions, bams=" ".join(a.bams)),
 398             header=False):
 399             print("\t".join(d))
 400
 401
 402 def convert_fqs(fqs):
 403     script = __file__
 404     return "'<%s %s c2t %s %s'" % (sys.executable, script, fqs[0],
 405                fqs[1] if len(fqs) > 1
 406                       else ','.join(['NA'] * len(fqs[0].split(","))))
 407
 408 def main(args=sys.argv[1:]):
 409
 410     if len(args) > 0 and args[0] == "index":
 411         assert len(args) == 2, ("must specify fasta as 2nd argument")
 412         sys.exit(bwa_index(convert_fasta(args[1])))
 413
 414     if len(args) > 0 and args[0] == "c2t":
 415         sys.exit(convert_reads(args[1], args[2]))
 416
 417     if len(args) > 0 and args[0] == "cnvs":
 418         sys.exit(cnvs_main(args[1:]))
 419
 420     p = argparse.ArgumentParser(__doc__)
 421     p.add_argument("--reference", help="reference fasta", required=True)
 422     p.add_argument("-t", "--threads", type=int, default=6)
 423     p.add_argument("--read-group", help="read-group to add to bam in same"
 424             " format as to bwa: '@RG\\tID:foo\\tSM:bar'")
 425     p.add_argument('--set-as-failed', help="flag alignments to this strand"
 426             " as not passing QC (0x200). Targetted BS-Seq libraries are often"
 427             " to a single strand, so we can flag them as QC failures. Note"
 428             " f == OT, r == OB. Likely, this will be 'f' as we will expect"
 429             " reads to align to the original-bottom (OB) strand and will flag"
 430             " as failed those aligning to the forward, or original top (OT).",
 431         default=None, choices=('f', 'r'))
 432     p.add_argument('--version', action='version', version='bwa-meth.py {}'.format(__version__))
 433
 434     p.add_argument("fastqs", nargs="+", help="bs-seq fastqs to align. Run"
 435             "multiple sets separated by commas, e.g. ... a_R1.fastq,b_R1.fastq"
 436             " a_R2.fastq,b_R2.fastq note that the order must be maintained.")
 437
 438     args, pass_through_args = p.parse_known_args(args)
 439
 440     # for the 2nd file. use G => A and bwa's support for streaming.
 441     conv_fqs_cmd = convert_fqs(args.fastqs)
 442
 443     bwa_mem(args.reference, conv_fqs_cmd, ' '.join(map(str, pass_through_args)),
 444              threads=args.threads, rg=args.read_group or
 445              rname(*args.fastqs),
 446              paired=len(args.fastqs) == 2,
 447              set_as_failed=args.set_as_failed)
 448
 449 if __name__ == "__main__":
 450     main(sys.argv[1:])