modified: makesrc3.mk
[GalaxyCodeBases.git] / BioInfo / BS-Seq / bwa-meth / compare / src / gsnap-meth.py
blob30fb75b65486787cd9bc96e1110b91f27255ed6b
1 """
2 runner for gsnap on methylation data.
3 gmap_build, gsnap and samtools must be on the PATH of the calling environment
4 """
6 import argparse
7 import sys
8 import os
9 import os.path as op
10 import subprocess as sp
12 def sh(cmd, log=sys.stderr, wait=True):
13 print >>sys.stderr, "[running command] %s" % cmd
14 p = sp.Popen(cmd, shell=True, stderr=sp.PIPE, stdout=sp.PIPE)
15 if wait:
16 for line in p.stderr:
17 print >>log, line,
18 p.wait()
19 if p.returncode != 0 or "aborted" in p.stderr.read():
20 sys.exit(p.returncode)
21 return p
23 def nopen(f, mode="rb"):
24 if not isinstance(f, basestring): return f
25 return open(f, mode)
28 def gmap_built(ref_dir, ref_base):
29 for ext in ("chromosome", "contig.iit", "ref12153offsetscomp",
30 "chromosome.iit", "genomecomp",
31 "chrsubset", "maps", "version", "contig",
32 "ref12153gammaptrs"):
34 f = "%s/%s/%s.%s" % (ref_dir, ref_base, ref_base, ext)
35 if not op.exists(f):
36 print >>sys.stderr, "%s does not exist" % f
37 return False
38 return True
40 def cmetindexed(ref_dir, ref_base, kmer):
42 for ext in ("metct12%i3offsetscomp",
43 "metct12%i3gammaptrs", "metga12%i3offsetscomp",
44 "metga12%i3gammaptrs"):
45 ext = ext % kmer
46 f = "%s/%s/%s.%s" % (ref_dir, ref_base, ref_base, ext)
47 if not op.exists(f):
48 print >>sys.stderr, "%s does not exist" % f
49 return False
50 return True
52 def gsnap_index(reference, kmer=15):
53 ref_dir, ref_base = check_reference(reference)
54 ref_base += "." + str(kmer)
56 cmd = "gmap_build --no-sarray -k %(kmer)i -D %(ref_dir)s -d %(ref_base)s %(reference)s"
57 cmd %= locals()
58 if not gmap_built(ref_dir, ref_base):
59 sh(cmd)
60 else:
61 print >>sys.stderr, "[skipping command] %s" % cmd
63 cmd_cmet = "cmetindex -d %(ref_base)s -F %(ref_dir)s -k %(kmer)d\n"
64 cmd_cmet %= locals()
65 if not cmetindexed(ref_dir, ref_base, kmer):
66 sh(cmd_cmet)
67 else:
68 print >>sys.stderr, "[skipping command] %s" % cmd
70 def check_reference(reference):
71 ref_dir = op.dirname(reference)
72 ref_base = op.splitext(op.basename(reference))[0] # locals
73 assert os.access(reference, os.R_OK), ("reference not found / readable")
74 assert os.access(ref_dir, os.W_OK), ("%s must be writable" % (ref_dir))
75 return ref_dir, ref_base
77 def gsnap_meth(reference, reads, prefix, kmer=15, stranded=False,
78 extra_args="", threads=1, rg=None):
80 ref_dir, ref_base = check_reference(reference)
81 ref_base += "." + str(kmer)
82 mode = ["cmet-nonstranded", "cmet-stranded"][int(stranded)]
83 reads_str = " ".join(reads)
84 if any(r.endswith(".gz") for r in reads):
85 extra_args = extra_args + " --gunzip"
86 cmd_gsnap = "set -eo pipefail;"
87 cmd_gsnap += "gsnap -B 4 --npaths 1 --quiet-if-excessive \
88 --nthreads {threads} \
89 -A sam -k {kmer} -D {ref_dir} -d {ref_base} --mode {mode} \
90 --use-sarray 0 \
91 --pairexpect 300 \
92 --pairdev 250 \
93 --read-group-id {rg} --read-group-name {rg} \
94 {extra_args} {reads_str}"
95 cmd_gsnap += "| samtools view -bS - | samtools sort - {prefix}"
96 cmd_gsnap = cmd_gsnap.format(**locals())
97 sh(cmd_gsnap)
99 cmd_index = "samtools index {prefix}.bam".format(**locals())
100 sh(cmd_index)
102 def run(args):
103 if args.threads is None:
104 import multiprocessing
105 threads = multiprocessing.cpu_count() # locals
106 else:
107 threads = args.threads
108 reference, kmer = op.abspath(args.reference), args.kmer
109 reads = map(op.abspath, args.reads)
110 rg = rname(reads[0], reads[1])
111 gsnap_meth(reference, reads, args.prefix, args.kmer, args.stranded,
112 args.extra_args, threads, rg=rg)
114 def rname(fq1, fq2):
115 def name(f):
116 n = op.basename(op.splitext(f)[0])
117 if n.endswith('.fastq'): n = n[:-6]
118 if n.endswith(('.fq', '.r1', '.r2')): n = n[:-3]
119 return n
120 return "".join(a for a, b in zip(name(fq1), name(fq2)) if a == b) or 'bm'
123 def main():
124 if len(sys.argv) > 1 and "index" == sys.argv[1]:
125 sys.exit(gsnap_index(sys.argv[2], kmer=int(sys.argv[3])))
127 p = argparse.ArgumentParser(description=__doc__,
128 formatter_class=argparse.RawDescriptionHelpFormatter)
129 p.add_argument("-r", "--reference", help="reference fasta")
130 p.add_argument("-k", dest="kmer", help="kmer length for gsnap. default:" \
131 + "%(default)i", type=int, default=15)
132 p.add_argument("-t", "--threads", dest="threads", default=None,
133 help="number of threads for gsnap to use", type=int)
134 p.add_argument("--stranded", action="store_true", default=False, help=\
135 "by default, non-stranded library prep is assumed")
136 p.add_argument("--prefix", help="prefix for output bam",
137 default="gsnap-meth")
138 p.add_argument("--extra-args", dest="extra_args", help="extra arguments"
139 " to send to gsnap", default="")
141 p.add_argument('reads', nargs='+', help='reads files (if 2 files are'
142 'they are assumed to be paired end)')
144 try:
145 args = p.parse_args()
146 except:
147 p.print_help()
148 raise
150 if (not len(args.reads) in (1, 2)) or args.reference is None:
151 sys.exit(not p.print_help())
153 run(args)
155 if __name__ == "__main__":
156 import doctest
157 if doctest.testmod(optionflags=doctest.ELLIPSIS |\
158 doctest.NORMALIZE_WHITESPACE).failed == 0:
159 main()