2 runner for gsnap on methylation data.
3 gmap_build, gsnap and samtools must be on the PATH of the calling environment
10 import subprocess
as sp
12 def sh(cmd
, log
=sys
.stderr
, wait
=True):
13 print >>sys
.stderr
, "[running command] %s" % cmd
14 p
= sp
.Popen(cmd
, shell
=True, stderr
=sp
.PIPE
, stdout
=sp
.PIPE
)
19 if p
.returncode
!= 0 or "aborted" in p
.stderr
.read():
20 sys
.exit(p
.returncode
)
23 def nopen(f
, mode
="rb"):
24 if not isinstance(f
, basestring
): return f
28 def gmap_built(ref_dir
, ref_base
):
29 for ext
in ("chromosome", "contig.iit", "ref12153offsetscomp",
30 "chromosome.iit", "genomecomp",
31 "chrsubset", "maps", "version", "contig",
34 f
= "%s/%s/%s.%s" % (ref_dir
, ref_base
, ref_base
, ext
)
36 print >>sys
.stderr
, "%s does not exist" % f
40 def cmetindexed(ref_dir
, ref_base
, kmer
):
42 for ext
in ("metct12%i3offsetscomp",
43 "metct12%i3gammaptrs", "metga12%i3offsetscomp",
44 "metga12%i3gammaptrs"):
46 f
= "%s/%s/%s.%s" % (ref_dir
, ref_base
, ref_base
, ext
)
48 print >>sys
.stderr
, "%s does not exist" % f
52 def gsnap_index(reference
, kmer
=15):
53 ref_dir
, ref_base
= check_reference(reference
)
54 ref_base
+= "." + str(kmer
)
56 cmd
= "gmap_build --no-sarray -k %(kmer)i -D %(ref_dir)s -d %(ref_base)s %(reference)s"
58 if not gmap_built(ref_dir
, ref_base
):
61 print >>sys
.stderr
, "[skipping command] %s" % cmd
63 cmd_cmet
= "cmetindex -d %(ref_base)s -F %(ref_dir)s -k %(kmer)d\n"
65 if not cmetindexed(ref_dir
, ref_base
, kmer
):
68 print >>sys
.stderr
, "[skipping command] %s" % cmd
70 def check_reference(reference
):
71 ref_dir
= op
.dirname(reference
)
72 ref_base
= op
.splitext(op
.basename(reference
))[0] # locals
73 assert os
.access(reference
, os
.R_OK
), ("reference not found / readable")
74 assert os
.access(ref_dir
, os
.W_OK
), ("%s must be writable" % (ref_dir
))
75 return ref_dir
, ref_base
77 def gsnap_meth(reference
, reads
, prefix
, kmer
=15, stranded
=False,
78 extra_args
="", threads
=1, rg
=None):
80 ref_dir
, ref_base
= check_reference(reference
)
81 ref_base
+= "." + str(kmer
)
82 mode
= ["cmet-nonstranded", "cmet-stranded"][int(stranded
)]
83 reads_str
= " ".join(reads
)
84 if any(r
.endswith(".gz") for r
in reads
):
85 extra_args
= extra_args
+ " --gunzip"
86 cmd_gsnap
= "set -eo pipefail;"
87 cmd_gsnap
+= "gsnap -B 4 --npaths 1 --quiet-if-excessive \
88 --nthreads {threads} \
89 -A sam -k {kmer} -D {ref_dir} -d {ref_base} --mode {mode} \
93 --read-group-id {rg} --read-group-name {rg} \
94 {extra_args} {reads_str}"
95 cmd_gsnap
+= "| samtools view -bS - | samtools sort - {prefix}"
96 cmd_gsnap
= cmd_gsnap
.format(**locals())
99 cmd_index
= "samtools index {prefix}.bam".format(**locals())
103 if args
.threads
is None:
104 import multiprocessing
105 threads
= multiprocessing
.cpu_count() # locals
107 threads
= args
.threads
108 reference
, kmer
= op
.abspath(args
.reference
), args
.kmer
109 reads
= map(op
.abspath
, args
.reads
)
110 rg
= rname(reads
[0], reads
[1])
111 gsnap_meth(reference
, reads
, args
.prefix
, args
.kmer
, args
.stranded
,
112 args
.extra_args
, threads
, rg
=rg
)
116 n
= op
.basename(op
.splitext(f
)[0])
117 if n
.endswith('.fastq'): n
= n
[:-6]
118 if n
.endswith(('.fq', '.r1', '.r2')): n
= n
[:-3]
120 return "".join(a
for a
, b
in zip(name(fq1
), name(fq2
)) if a
== b
) or 'bm'
124 if len(sys
.argv
) > 1 and "index" == sys
.argv
[1]:
125 sys
.exit(gsnap_index(sys
.argv
[2], kmer
=int(sys
.argv
[3])))
127 p
= argparse
.ArgumentParser(description
=__doc__
,
128 formatter_class
=argparse
.RawDescriptionHelpFormatter
)
129 p
.add_argument("-r", "--reference", help="reference fasta")
130 p
.add_argument("-k", dest
="kmer", help="kmer length for gsnap. default:" \
131 + "%(default)i", type=int, default
=15)
132 p
.add_argument("-t", "--threads", dest
="threads", default
=None,
133 help="number of threads for gsnap to use", type=int)
134 p
.add_argument("--stranded", action
="store_true", default
=False, help=\
135 "by default, non-stranded library prep is assumed")
136 p
.add_argument("--prefix", help="prefix for output bam",
137 default
="gsnap-meth")
138 p
.add_argument("--extra-args", dest
="extra_args", help="extra arguments"
139 " to send to gsnap", default
="")
141 p
.add_argument('reads', nargs
='+', help='reads files (if 2 files are'
142 'they are assumed to be paired end)')
145 args
= p
.parse_args()
150 if (not len(args
.reads
) in (1, 2)) or args
.reference
is None:
151 sys
.exit(not p
.print_help())
155 if __name__
== "__main__":
157 if doctest
.testmod(optionflags
=doctest
.ELLIPSIS |\
158 doctest
.NORMALIZE_WHITESPACE
).failed
== 0: