Update README add Debian releases
[MACS.git] / bin / macs2
blob961b8b75a5d4d4e42c58c12c01aadd1e111ac00d
1 #!/usr/bin/env python
2 # Time-stamp: <2019-09-25 10:24:38 taoliu>
4 """Description: MACS v2 main executable.
6 This code is free software; you can redistribute it and/or modify it
7 under the terms of the BSD License (see the file LICENSE included with
8 the distribution).
9 """
11 # ------------------------------------
12 # python modules
13 # ------------------------------------
15 import os
16 import sys
17 import argparse as ap
18 import tempfile
20 # ------------------------------------
21 # own python modules
22 # ------------------------------------
23 from MACS2.Constants import *
25 # ------------------------------------
26 # Main function
27 # ------------------------------------
28 def main():
29 """The Main function/pipeline for MACS.
31 """
32 # Parse options...
33 argparser = prepare_argparser()
34 args = argparser.parse_args()
36 subcommand = args.subcommand
38 if args.outdir:
39 # use a output directory to store MACS output
40 if not os.path.exists( args.outdir ):
41 try:
42 os.makedirs( args.outdir )
43 except:
44 sys.exit( "Output directory (%s) could not be created. Terminating program." % args.outdir )
47 if subcommand == "callpeak":
48 # General call peak
49 from MACS2.callpeak_cmd import run
50 try:
51 run( args )
52 except MemoryError:
53 sys.exit( "MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
54 #elif subcommand == "diffpeak":
55 # # differential peak calling w/ bedgraphs + optional peak regions
56 # from MACS2.diffpeak_cmd import run
57 # run( args )
58 elif subcommand == "bdgpeakcall":
59 # call peak from bedGraph
60 from MACS2.bdgpeakcall_cmd import run
61 run( args )
62 elif subcommand == "bdgbroadcall":
63 # call broad peak from bedGraph
64 from MACS2.bdgbroadcall_cmd import run
65 run( args )
66 elif subcommand == "bdgcmp":
67 # compare treatment and control to make enrichment scores
68 from MACS2.bdgcmp_cmd import run
69 run( args )
70 elif subcommand == "bdgopt":
71 # operations on the score column of bedGraph file
72 from MACS2.bdgopt_cmd import run
73 run( args )
74 elif subcommand == "cmbreps":
75 # combine replicates
76 from MACS2.cmbreps_cmd import run
77 run( args )
78 elif subcommand == "randsample":
79 # randomly sample sequencing reads, and save as bed file
80 from MACS2.randsample_cmd import run
81 try:
82 run( args )
83 except MemoryError:
84 sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
85 elif subcommand == "filterdup":
86 # filter out duplicate reads, and save as bed file
87 from MACS2.filterdup_cmd import run
88 try:
89 run( args )
90 except MemoryError:
91 sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
92 elif subcommand == "bdgdiff":
93 # differential calling
94 from MACS2.bdgdiff_cmd import run
95 run( args )
96 elif subcommand == "refinepeak":
97 # refine peak summits
98 from MACS2.refinepeak_cmd import run
99 try:
100 run( args )
101 except MemoryError:
102 sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
103 elif subcommand == "predictd":
104 # predict d or fragment size
105 from MACS2.predictd_cmd import run
106 try:
107 run( args )
108 except MemoryError:
109 sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
110 elif subcommand == "pileup":
111 # pileup alignment results with a given extension method
112 from MACS2.pileup_cmd import run
113 try:
114 run( args )
115 except MemoryError:
116 sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
118 def prepare_argparser ():
119 """Prepare optparser object. New options will be added in this
120 function first.
123 description = "%(prog)s -- Model-based Analysis for ChIP-Sequencing"
124 epilog = "For command line options of each command, type: %(prog)s COMMAND -h"
125 #Check community site: http://groups.google.com/group/macs-announcement/
126 #Source code: https://github.com/taoliu/MACS/"
127 # top-level parser
128 argparser = ap.ArgumentParser( description = description, epilog = epilog ) #, usage = usage )
129 argparser.add_argument("--version", action="version", version="%(prog)s "+MACS_VERSION)
130 subparsers = argparser.add_subparsers( dest = 'subcommand' ) #help="sub-command help")
131 subparsers.required = True
133 # command for 'callpeak'
134 add_callpeak_parser( subparsers )
136 # # command for 'diffpeak'
137 # add_diffpeak_parser( subparsers )
139 # command for 'bdgpeakcall'
140 add_bdgpeakcall_parser( subparsers )
142 # command for 'bdgbroadcall'
143 add_bdgbroadcall_parser( subparsers )
145 # command for 'bdgcmp'
146 add_bdgcmp_parser( subparsers )
148 # command for 'bdgopt'
149 add_bdgopt_parser( subparsers )
151 # command for 'cmbreps'
152 add_cmbreps_parser( subparsers )
154 # command for 'bdgdiff'
155 add_bdgdiff_parser( subparsers )
157 # command for 'filterdup'
158 add_filterdup_parser( subparsers )
160 # command for 'predictd'
161 add_predictd_parser( subparsers )
163 # command for 'pileup'
164 add_pileup_parser( subparsers )
166 # command for 'randsample'
167 add_randsample_parser( subparsers )
169 # command for 'refinepeak'
170 add_refinepeak_parser( subparsers )
172 return argparser
174 def add_outdir_option ( parser ):
175 parser.add_argument("--outdir", dest = "outdir", type = str, default = '',
176 help = "If specified all output files will be written to that directory. Default: the current working directory")
178 def add_output_group ( parser, required = True ):
179 output_group = parser.add_mutually_exclusive_group( required = required )
180 output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str,
181 help = "Output file name. Mutually exclusive with --o-prefix." )
182 output_group.add_argument( "--o-prefix", dest = "oprefix", type = str,
183 help = "Output file prefix. Mutually exclusive with -o/--ofile." )
185 def add_callpeak_parser( subparsers ):
186 """Add main function 'peak calling' argument parsers.
188 argparser_callpeak = subparsers.add_parser("callpeak", help="Main MACS2 Function: Call peaks from alignment results.")
190 # group for input files
191 group_input = argparser_callpeak.add_argument_group( "Input files arguments" )
192 group_input.add_argument( "-t", "--treatment", dest = "tfile", type = str, required = True, nargs = "+",
193 help = "ChIP-seq treatment file. If multiple files are given as '-t A B C', then they will all be read and pooled together. REQUIRED." )
194 group_input.add_argument( "-c", "--control", dest = "cfile", type = str, nargs = "*",
195 help = "Control file. If multiple files are given as '-c A B C', they will be pooled to estimate ChIP-seq background noise.")
196 group_input.add_argument( "-f", "--format", dest = "format", type = str,
197 choices = ("AUTO", "BAM", "SAM", "BED", "ELAND",
198 "ELANDMULTI", "ELANDEXPORT", "BOWTIE",
199 "BAMPE", "BEDPE"),
200 help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format (except for BAMPE and BEDPE which should be implicitly set) the file is. Please check the definition in README. Please note that if the format is set as BAMPE or BEDPE, MACS2 will call its special Paired-end mode to call peaks by piling up the actual ChIPed fragments defined by both aligned ends, instead of predicting the fragment size first and extending reads. Also please note that the BEDPE only contains three columns, and is NOT the same BEDPE format used by BEDTOOLS. DEFAULT: \"AUTO\"",
201 default = "AUTO" )
202 group_input.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
203 help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default:hs" )
204 group_input.add_argument( "-s", "--tsize", dest = "tsize", type = int, default = None,
205 help = "Tag size/read length. This will override the auto detected tag size. DEFAULT: Not set")
206 group_input.add_argument( "--keep-dup", dest = "keepduplicates", type = str, default = "1",
207 help = "It controls the behavior towards duplicate tags at the exact same location -- the same coordination and the same strand. The 'auto' option makes MACS calculate the maximum tags at the exact same location based on binomal distribution using 1e-5 as pvalue cutoff; and the 'all' option keeps every tags. If an integer is given, at most this number of tags will be kept at the same location. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS2 will still read them although the reads may be decided by MACS2 as duplicate later. If you plan to rely on samtools/picard/any other tool to filter duplicates, please remove those duplicate reads and save a new alignment file then ask MACS2 to keep all by '--keep-dup all'. The default is to keep one tag at the same location. Default: 1" )
209 # group for output files
210 group_output = argparser_callpeak.add_argument_group( "Output arguments" )
211 add_outdir_option( group_output )
212 group_output.add_argument( "-n", "--name", dest = "name", type = str,
213 help = "Experiment name, which will be used to generate output file names. DEFAULT: \"NA\"",
214 default = "NA" )
215 group_output.add_argument( "-B", "--bdg", dest = "store_bdg", action = "store_true",
216 help = "Whether or not to save extended fragment pileup, and local lambda tracks (two files) at every bp into a bedGraph file. DEFAULT: False",
217 default = False )
218 group_output.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
219 help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )
220 group_output.add_argument( "--trackline", dest="trackline", action="store_true", default = False,
221 help = "Tells MACS to include trackline with bedGraph files. To include this trackline while displaying bedGraph at UCSC genome browser, can show name and description of the file as well. However my suggestion is to convert bedGraph to bigWig, then show the smaller and faster binary bigWig file at UCSC genome browser, as well as downstream analysis. Require -B to be set. Default: Not include trackline." )
223 group_output.add_argument( "--SPMR", dest = "do_SPMR", action = "store_true", default = False,
224 help = "If True, MACS will SAVE signal per million reads for fragment pileup profiles. It won't interfere with computing pvalue/qvalue during peak calling, since internally MACS2 keeps using the raw pileup and scaling factors between larger and smaller dataset to calculate statistics measurements. If you plan to use the signal output in bedGraph to call peaks using bdgcmp and bdgpeakcall, you shouldn't use this option because you will end up with different results. However, this option is recommended for displaying normalized pileup tracks across many datasets. Require -B to be set. Default: False" )
225 # group for bimodal
226 group_bimodal = argparser_callpeak.add_argument_group( "Shifting model arguments" )
227 group_bimodal.add_argument( "--nomodel", dest = "nomodel", action = "store_true", default = False,
228 help = "Whether or not to build the shifting model. If True, MACS will not build model. by default it means shifting size = 100, try to set extsize to change it. It's highly recommended that while you have many datasets to process and you plan to compare different conditions, aka differential calling, use both 'nomodel' and 'extsize' to make signal files from different datasets comparable. DEFAULT: False" )
229 group_bimodal.add_argument( "--shift", dest = "shift", type = int, default = 0,
230 help = "(NOT the legacy --shiftsize option!) The arbitrary shift in bp. Use discretion while setting it other than default value. When NOMODEL is set, MACS will use this value to move cutting ends (5') towards 5'->3' direction then apply EXTSIZE to extend them to fragments. When this value is negative, ends will be moved toward 3'->5' direction. Recommended to keep it as default 0 for ChIP-Seq datasets, or -1 * half of EXTSIZE together with EXTSIZE option for detecting enriched cutting loci such as certain DNAseI-Seq datasets. Note, you can't set values other than 0 if format is BAMPE or BEDPE for paired-end data. DEFAULT: 0. " )
231 group_bimodal.add_argument( "--extsize", dest = "extsize", type = int, default = 200,
232 help = "The arbitrary extension size in bp. When nomodel is true, MACS will use this value as fragment size to extend each read towards 3' end, then pile them up. It's exactly twice the number of obsolete SHIFTSIZE. In previous language, each read is moved 5'->3' direction to middle of fragment by 1/2 d, then extended to both direction with 1/2 d. This is equivalent to say each read is extended towards 5'->3' into a d size fragment. DEFAULT: 200. EXTSIZE and SHIFT can be combined when necessary. Check SHIFT option." )
233 group_bimodal.add_argument( "--bw", dest = "bw", type = int, default = 300,
234 help = "Band width for picking regions to compute fragment size. This value is only used while building the shifting model. Tweaking this is not recommended. DEFAULT: 300")
235 group_bimodal.add_argument( "--d-min", dest = "d_min", type = int, default = 20,
236 help = "Minimum fragment size in basepair. Any predicted fragment size less than this will be excluded. DEFAULT: 20")
237 group_bimodal.add_argument( "-m", "--mfold", dest = "mfold", type = int, default = [5,50], nargs = 2,
238 help = "Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. Fold-enrichment in regions must be lower than upper limit, and higher than the lower limit. Use as \"-m 10 30\". This setting is only used while building the shifting model. Tweaking it is not recommended. DEFAULT:5 50" )
240 group_bimodal.add_argument( "--fix-bimodal", dest = "onauto", action = "store_true",
241 help = "Whether turn on the auto pair model process. If set, when MACS failed to build paired model, it will use the nomodel settings, the --exsize parameter to extend each tags towards 3' direction. Not to use this automate fixation is a default behavior now. DEFAULT: False",
242 default = False )
244 # General options.
245 group_callpeak = argparser_callpeak.add_argument_group( "Peak calling arguments" )
246 p_or_q_group = group_callpeak.add_mutually_exclusive_group()
247 p_or_q_group.add_argument( "-q", "--qvalue", dest = "qvalue", type = float, default = 0.05,
248 help = "Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.05. -q, and -p are mutually exclusive." )
249 p_or_q_group.add_argument( "-p", "--pvalue", dest = "pvalue", type = float,
250 help = "Pvalue cutoff for peak detection. DEFAULT: not set. -q, and -p are mutually exclusive. If pvalue cutoff is set, qvalue will not be calculated and reported as -1 in the final .xls file." )
252 # about scaling
253 group_callpeak.add_argument( "--scale-to", dest = "scaleto", type = str, choices = ("large", "small"),
254 help = "When set to 'small', scale the larger sample up to the smaller sample. When set to 'larger', scale the smaller sample up to the bigger sample. By default, scale to 'small'. This option replaces the obsolete '--to-large' option. The default behavior is recommended since it will lead to less significant p/q-values in general but more specific results. Keep in mind that scaling down will influence control/input sample more. DEFAULT: 'small', the choice is either 'small' or 'large'." )
255 group_callpeak.add_argument( "--ratio", dest = "ratio", type = float, default = 1.0,
256 help = "When set, use a custom scaling ratio of ChIP/control (e.g. calculated using NCIS) for linear scaling. DEFAULT: ingore" )
257 group_callpeak.add_argument( "--down-sample", dest = "downsample", action = "store_true", default = False,
258 help = "When set, random sampling method will scale down the bigger sample. By default, MACS uses linear scaling. Warning: This option will make your result unstable and irreproducible since each time, random reads would be selected. Consider to use 'randsample' script instead. <not implmented>If used together with --SPMR, 1 million unique reads will be randomly picked.</not implemented> Caution: due to the implementation, the final number of selected reads may not be as you expected! DEFAULT: False" )
259 group_callpeak.add_argument( "--seed", dest = "seed", type = int, default = -1,
260 help = "Set the random seed while down sampling data. Must be a non-negative integer in order to be effective. DEFAULT: not set" )
261 group_callpeak.add_argument( "--tempdir", dest="tempdir", default=tempfile.gettempdir(),
262 help = "Optional directory to store temp files. DEFAULT: %(default)s")
263 group_callpeak.add_argument( "--nolambda", dest = "nolambda", action = "store_true",
264 help = "If True, MACS will use fixed background lambda as local lambda for every peak region. Normally, MACS calculates a dynamic local lambda to reflect the local bias due to the potential chromatin accessibility. ",
265 default = False )
266 group_callpeak.add_argument( "--slocal", dest = "smalllocal", type = int, default = 1000,
267 help = "The small nearby region in basepairs to calculate dynamic lambda. This is used to capture the bias near the peak summit region. Invalid if there is no control data. If you set this to 0, MACS will skip slocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation while the control data is available. The final local bias would be the maximum of the lambda value from d, slocal, and llocal size windows. While control is not available, d and slocal lambda won't be considered. DEFAULT: 1000 " )
268 group_callpeak.add_argument( "--llocal", dest = "largelocal", type = int, default = 10000,
269 help = "The large nearby region in basepairs to calculate dynamic lambda. This is used to capture the surround bias. If you set this to 0, MACS will skip llocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation while the control data is available. The final local bias would be the maximum of the lambda value from d, slocal, and llocal size windows. While control is not available, d and slocal lambda won't be considered. DEFAULT: 10000." )
270 group_callpeak.add_argument( "--max-gap", dest = "maxgap", type = int,
271 help = "Maximum gap between significant sites to cluster them together. The DEFAULT value is the detected read length/tag size." )
272 group_callpeak.add_argument( "--min-length", dest = "minlen", type = int,
273 help = "Minimum length of a peak. The DEFAULT value is the predicted fragment size d. Note, if you set a value smaller than the fragment size, it may have NO effect on the result. For BROAD peak calling, try to set a large value such as 500bps. You can also use '--cutoff-analysis' option with default setting, and check the column 'avelpeak' under different cutoff values to decide a reasonable minlen value." )
274 group_callpeak.add_argument( "--broad", dest = "broad", action = "store_true",
275 help = "If set, MACS will try to call broad peaks using the --broad-cutoff setting. Please tweak '--broad-cutoff' setting to control the peak calling behavior. At the meantime, either -q or -p cutoff will be used to define regions with 'stronger enrichment' inside of broad peaks. The maximum gap is expanded to 4 * MAXGAP (--max-gap parameter). As a result, MACS will output a 'gappedPeak' and a 'broadPeak' file instead of 'narrowPeak' file. Note, a broad peak will be reported even if there is no 'stronger enrichment' inside. DEFAULT: False", default = False )
276 group_callpeak.add_argument( "--broad-cutoff", dest = "broadcutoff", type = float, default = 0.1,
277 help = "Cutoff for broad region. This option is not available unless --broad is set. If -p is set, this is a pvalue cutoff, otherwise, it's a qvalue cutoff. Please note that in broad peakcalling mode, MACS2 uses this setting to control the overall peak calling behavior, then uses -q or -p setting to define regions inside broad region as 'stronger' enrichment. DEFAULT: 0.1 " )
278 group_callpeak.add_argument( "--cutoff-analysis", dest="cutoff_analysis", action="store_true",
279 help = "While set, MACS2 will analyze number or total length of peaks that can be called by different p-value cutoff then output a summary table to help user decide a better cutoff. The table will be saved in NAME_cutoff_analysis.txt file. Note, minlen and maxgap may affect the results. WARNING: May take ~30 folds longer time to finish. The result can be useful for users to decide a reasonable cutoff value. DEFAULT: False", default = False )
281 # post-processing options
282 group_postprocessing = argparser_callpeak.add_argument_group( "Post-processing options" )
283 postprocess_group = group_postprocessing.add_mutually_exclusive_group()
285 postprocess_group.add_argument( "--call-summits", dest="call_summits", action="store_true",
286 help="If set, MACS will use a more sophisticated signal processing approach to find subpeak summits in each enriched peak region. DEFAULT: False",default=False)
287 # postprocess_group.add_argument( "--refine-peaks", dest="refine_peaks", action="store_true",
288 # help="If set, MACS will refine peak summits by measuring balance of waston/crick tags. Those peaks without balancing tags will be disgarded. Peak summits will be redefined and reassgined with scores. Note, duplicate reads will be put back while calculating read balance. And more memory will be used. Default: False", default=False )
289 group_postprocessing.add_argument( "--fe-cutoff", dest="fecutoff", type=float, default = 1.0,
290 help = "When set, the value will be used to filter out peaks with low fold-enrichment. Note, MACS2 use 1.0 as pseudocount while calculating fold-enrichment. DEFAULT: 1.0")
292 # other options
293 group_other = argparser_callpeak.add_argument_group( "Other options" )
294 group_other.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
295 help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
297 # obsolete options
298 group_obsolete = argparser_callpeak.add_argument_group( "Obsolete options" )
299 group_obsolete.add_argument( "--to-large", dest = "tolarge", action = "store_true", default = False,
300 help = "Obsolete option. Please use '--scale-to large' instead." )
301 return
303 def add_diffpeak_parser( subparsers ):
304 """Add main function 'peak calling' argument parsers.
306 argparser_diffpeak = subparsers.add_parser("diffpeak", help="MACS2 Differential Peak Function: Call peaks from bedgraphs (or use optional peak regions) and determine peaks of differential occupancy")
309 # group for input files
310 group_input = argparser_diffpeak.add_argument_group( "Input files arguments" )
311 group_input.add_argument( "--t1", dest = "t1bdg", type = str, required = True,
312 help = "MACS pileup bedGraph for condition 1. REQUIRED" )
313 group_input.add_argument( "--t2", dest="t2bdg", type = str, required = True,
314 help = "MACS pileup bedGraph for condition 2. REQUIRED" )
315 group_input.add_argument( "--c1", dest = "c1bdg", type = str, required = True,
316 help = "MACS control lambda bedGraph for condition 1. REQUIRED" )
317 group_input.add_argument( "--c2", dest="c2bdg", type = str, required = True,
318 help = "MACS control lambda bedGraph for condition 2. REQUIRED" )
319 group_input.add_argument( "--peaks1", dest = "peaks1", type = str, default='',
320 help = "MACS peaks.xls file for condition 1. Optional but must specify peaks2 if present" )
321 group_input.add_argument( "--peaks2", dest="peaks2", type = str, default='',
322 help = "MACS peaks.xls file for condition 2. Optional but must specify peaks1 if present" )
323 group_input.add_argument( "-d", "--depth-multiplier", dest = "depth", type = float, default = [1.0], nargs = "+",
324 help = "Sequence depth in million reads. If two depths are different, use '-d X -d Y' for X million reads in condition 1 and Y million reads in condition 2. If they are same, use '-d X' for X million reads in both condition 1 and condition 2 (e.g. the bedGraph files are from 'callpeak --SPMR'). Default: 1 (if you use 'macs2 callpeak --SPMR' to generate bdg files, we recommend using the smaller depth as a multiplier)" )
325 # group_input.add_argument( "-f", "--format", dest = "format", type = str,
326 # choices = ("AUTO", "BED", "XLS"),
327 # help = "Format of peak regions file, \"AUTO\", \"BED\" or \"XLS\". The default AUTO option will let MACS decide which format the file is based on the file extension. DEFAULT: \"AUTO\"",
328 # default = "AUTO" )
330 # group for output files
331 group_output = argparser_diffpeak.add_argument_group( "Output arguments" )
332 add_outdir_option( group_output )
333 group_output.add_argument( "-n", "--name", dest = "name", type = str,
334 help = "Experiment name, which will be used to generate output file names. DEFAULT: \"diffpeak\"",
335 default = "diffpeak" )
336 group_output.add_argument( "-B", "--bdg", dest = "store_bdg", action = "store_true",
337 help = "Whether or not to save basewise p/qvalues from every peak region into a bedGraph file. DEFAULT: False",
338 default = False )
339 group_output.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
340 help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )
341 group_output.add_argument( "--trackline", dest="trackline", action="store_true", default = False,
342 help = "Tells MACS to include trackline with bedGraph files. To include this trackline while displaying bedGraph at UCSC genome browser, can show name and description of the file as well. However my suggestion is to convert bedGraph to bigWig, then show the smaller and faster binary bigWig file at UCSC genome browser, as well as downstream analysis. Require -B to be set. Default: Not include trackline." )
344 # General options.
345 group_diffpeak = argparser_diffpeak.add_argument_group( "Peak calling arguments" )
346 p_or_q_group = group_diffpeak.add_mutually_exclusive_group()
347 p_or_q_group.add_argument( "-q", "--qvalue", dest = "diff_qvalue", type = float, default = 0.05,
348 help = "Minimum FDR (q-value) cutoff for differences. DEFAULT: 0.05. -q and -p are mutually exclusive." )
349 p_or_q_group.add_argument( "-p", "--pvalue", dest = "diff_pvalue", type = float,
350 help = "Pvalue cutoff for differences. DEFAULT: not set. -q and -p are mutually exclusive." )
351 p_or_q_group2 = group_diffpeak.add_mutually_exclusive_group()
352 p_or_q_group2.add_argument( "--peaks-qvalue", dest = "peaks_qvalue", type = float, default = 0.05,
353 help = "Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.05. --peaks-qvalue and --peaks-pvalue are mutually exclusive." )
354 p_or_q_group2.add_argument( "--peaks-pvalue", dest = "peaks_pvalue", type = float,
355 help = "Pvalue cutoff for peak detection. DEFAULT: not set. --peaks-qvalue and --peaks-pvalue are mutually exclusive." )
356 group_diffpeak.add_argument( "-m", "--peak-min-len", dest = "pminlen", type = int,
357 help = "Minimum length of peak regions. DEFAULT: 200", default = 200 )
358 group_diffpeak.add_argument( "--diff-min-len", dest = "dminlen", type = int,
359 help = "Minimum length of differential region (must overlap a valid peak). DEFAULT: 50", default = 100 )
360 group_diffpeak.add_argument( "--ignore-duplicate-peaks", dest="ignore_duplicate_peaks", action="store_false",
361 help="If set, MACS will ignore duplicate regions with identical coordinates. Helpful if --call-summits was set. DEFAULT: True",default=True)
362 return
364 def add_filterdup_parser( subparsers ):
365 argparser_filterdup = subparsers.add_parser( "filterdup",
366 help = "Remove duplicate reads at the same position, then save the rest alignments to BED or BEDPE file. If you use '--keep-dup all option', this script can be utilized to convert any acceptable format into BED or BEDPE format." )
367 argparser_filterdup.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
368 help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
369 argparser_filterdup.add_argument( "-f", "--format", dest = "format", type = str,
370 choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"),
371 help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"",
372 default = "AUTO" )
373 argparser_filterdup.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
374 help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), DEFAULT:hs" )
375 argparser_filterdup.add_argument( "-s", "--tsize", dest = "tsize", type = int,
376 help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set" )
377 argparser_filterdup.add_argument( "-p", "--pvalue", dest = "pvalue", type = float,
378 help = "Pvalue cutoff for binomial distribution test. DEFAULT:1e-5" )
379 argparser_filterdup.add_argument( "--keep-dup", dest = "keepduplicates", type = str, default = "auto",
380 help = "It controls the '%(prog)s' behavior towards duplicate tags/pairs at the exact same location -- the same coordination and the same strand. The 'auto' option makes '%(prog)s' calculate the maximum tags at the exact same location based on binomal distribution using given -p as pvalue cutoff; and the 'all' option keeps every tags (useful if you only want to convert formats). If an integer is given, at most this number of tags will be kept at the same location. Note, MACS2 callpeak function uses KEEPDUPLICATES=1 as default. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS2 will still read them although the reads may be decided by MACS2 as duplicate later. Default: auto" )
381 argparser_filterdup.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
382 help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
384 argparser_filterdup.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
385 help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
386 add_outdir_option( argparser_filterdup )
387 argparser_filterdup.add_argument( "-o", "--ofile", dest = "outputfile", type = str,
388 help = "Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. DEFAULT: stdout",
389 default = "stdout" )
390 argparser_filterdup.add_argument( "-d", "--dry-run", dest="dryrun", action="store_true", default=False,
391 help = "When set, filterdup will only output numbers instead of writing output files, including maximum allowable duplicates, total number of reads before filtering, total number of reads after filtering, and redundant rate. Default: not set" )
392 return
394 def add_bdgpeakcall_parser( subparsers ):
395 """Add function 'peak calling on bedGraph' argument parsers.
396 """
397 argparser_bdgpeakcall = subparsers.add_parser( "bdgpeakcall",
398 help = "Call peaks from bedGraph output. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
399 argparser_bdgpeakcall.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True,
400 help = "MACS score in bedGraph. REQUIRED" )
401 argparser_bdgpeakcall.add_argument( "-c", "--cutoff" , dest = "cutoff", type = float,
402 help = "Cutoff depends on which method you used for score track. If the file contains pvalue scores from MACS2, score 5 means pvalue 1e-5. DEFAULT: 5", default = 5 )
403 argparser_bdgpeakcall.add_argument( "-l", "--min-length", dest = "minlen", type = int,
404 help = "minimum length of peak, better to set it as d value. DEFAULT: 200", default = 200 )
405 argparser_bdgpeakcall.add_argument( "-g", "--max-gap", dest = "maxgap", type = int,
406 help = "maximum gap between significant points in a peak, better to set it as tag size. DEFAULT: 30", default = 30 )
407 argparser_bdgpeakcall.add_argument( "--call-summits", dest="call_summits", action="store_true", help=ap.SUPPRESS, default=False)
408 # help="If set, MACS will use a more sophisticated approach to find all summits in each enriched peak region. DEFAULT: False",default=False)
409 argparser_bdgpeakcall.add_argument( "--cutoff-analysis", dest="cutoff_analysis", action="store_true",
410 help = "While set, bdgpeakcall will analyze number or total length of peaks that can be called by different cutoff then output a summary table to help user decide a better cutoff. Note, minlen and maxgap may affect the results. DEFAULT: False", default = False )
411 argparser_bdgpeakcall.add_argument("--no-trackline", dest="trackline", action="store_false", default=True,
412 help="Tells MACS not to include trackline with bedGraph files. The trackline is required by UCSC.")
414 add_outdir_option( argparser_bdgpeakcall )
415 add_output_group( argparser_bdgpeakcall )
417 return
419 def add_bdgbroadcall_parser( subparsers ):
420 """Add function 'broad peak calling on bedGraph' argument parsers.
422 argparser_bdgbroadcall = subparsers.add_parser( "bdgbroadcall",
423 help = "Call broad peaks from bedGraph output. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
424 argparser_bdgbroadcall.add_argument( "-i", "--ifile", dest = "ifile" , type = str, required = True,
425 help = "MACS score in bedGraph. REQUIRED" )
426 argparser_bdgbroadcall.add_argument( "-c", "--cutoff-peak", dest = "cutoffpeak", type = float,
427 help = "Cutoff for peaks depending on which method you used for score track. If the file contains qvalue scores from MACS2, score 2 means qvalue 0.01. DEFAULT: 2",
428 default = 2 )
429 argparser_bdgbroadcall.add_argument( "-C", "--cutoff-link", dest = "cutofflink", type = float,
430 help = "Cutoff for linking regions/low abundance regions depending on which method you used for score track. If the file contains qvalue scores from MACS2, score 1 means qvalue 0.1, and score 0.3 means qvalue 0.5. DEFAULT: 1", default = 1 )
431 argparser_bdgbroadcall.add_argument( "-l", "--min-length", dest = "minlen", type = int,
432 help = "minimum length of peak, better to set it as d value. DEFAULT: 200", default = 200 )
433 argparser_bdgbroadcall.add_argument( "-g", "--lvl1-max-gap", dest = "lvl1maxgap", type = int,
434 help = "maximum gap between significant peaks, better to set it as tag size. DEFAULT: 30", default = 30 )
435 argparser_bdgbroadcall.add_argument( "-G", "--lvl2-max-gap", dest = "lvl2maxgap", type = int,
436 help = "maximum linking between significant peaks, better to set it as 4 times of d value. DEFAULT: 800", default = 800)
438 add_outdir_option( argparser_bdgbroadcall )
439 add_output_group( argparser_bdgbroadcall )
440 return
442 def add_bdgcmp_parser( subparsers ):
443 """Add function 'peak calling on bedGraph' argument parsers.
445 argparser_bdgcmp = subparsers.add_parser( "bdgcmp",
446 help = "Deduct noise by comparing two signal tracks in bedGraph. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
447 argparser_bdgcmp.add_argument( "-t", "--tfile", dest = "tfile", type = str, required = True,
448 help = "Treatment bedGraph file, e.g. *_treat_pileup.bdg from MACSv2. REQUIRED")
449 argparser_bdgcmp.add_argument( "-c", "--cfile", dest = "cfile", type = str, required = True,
450 help = "Control bedGraph file, e.g. *_control_lambda.bdg from MACSv2. REQUIRED")
451 argparser_bdgcmp.add_argument( "-S", "--scaling-factor", dest = "sfactor", type = float, default = 1.0,
452 help = "Scaling factor for treatment and control track. Keep it as 1.0 or default in most cases. Set it ONLY while you have SPMR output from MACS2 callpeak, and plan to calculate scores as MACS2 callpeak module. If you want to simulate 'callpeak' w/o '--to-large', calculate effective smaller sample size after filtering redudant reads in million (e.g., put 31.415926 if effective reads are 31,415,926) and input it for '-S'; for 'callpeak --to-large', calculate effective reads in larger sample. DEFAULT: 1.0")
453 argparser_bdgcmp.add_argument( "-p", "--pseudocount", dest = "pseudocount", type = float, default = 0.0,
454 help = "The pseudocount used for calculating logLR, logFE or FE. The count will be applied after normalization of sequencing depth. DEFAULT: 0.0, no pseudocount is applied.")
456 argparser_bdgcmp.add_argument( "-m", "--method", dest = "method", type = str, nargs = "+",
457 choices = ( "ppois", "qpois", "subtract", "logFE", "FE", "logLR", "slogLR", "max" ),
458 help = "Method to use while calculating a score in any bin by comparing treatment value and control value. Available choices are: ppois, qpois, subtract, logFE, logLR, and slogLR. They represent Poisson Pvalue (-log10(pvalue) form) using control as lambda and treatment as observation, q-value through a BH process for poisson pvalues, subtraction from treatment, linear scale fold enrichment, log10 fold enrichment(need to set pseudocount), log10 likelihood between ChIP-enriched model and open chromatin model(need to set pseudocount), symmetric log10 likelihood between two ChIP-enrichment models, or maximum value between the two tracks. Default option is ppois.",default="ppois")
460 add_outdir_option( argparser_bdgcmp )
461 output_group = argparser_bdgcmp.add_mutually_exclusive_group( required = True )
462 output_group.add_argument( "--o-prefix", dest = "oprefix", type = str,
463 help = "The PREFIX of output bedGraph file to write scores. If it is given as A, and method is 'ppois', output file will be A_ppois.bdg. Mutually exclusive with -o/--ofile." )
464 output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str, nargs = "+",
465 help = "Output filename. Mutually exclusive with --o-prefix. The number and the order of arguments for --ofile must be the same as for -m." )
466 return
468 def add_bdgopt_parser( subparsers ):
469 """Add function 'operations on score column of bedGraph' argument parsers.
470 """
471 argparser_bdgopt = subparsers.add_parser( "bdgopt",
472 help = "Operations on score column of bedGraph file. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
473 argparser_bdgopt.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True,
474 help = "MACS score in bedGraph. Note: this must be a bedGraph file covering the ENTIRE genome. REQUIRED" )
475 argparser_bdgopt.add_argument( "-m", "--method", dest = "method", type = str,
476 choices = ( "multiply", "add", "p2q", "max", "min" ),
477 help = "Method to modify the score column of bedGraph file. Available choices are: multiply, add, max, min, or p2q. 1) multiply, the EXTRAPARAM is required and will be multiplied to the score column. If you intend to divide the score column by X, use value of 1/X as EXTRAPARAM. 2) add, the EXTRAPARAM is required and will be added to the score column. If you intend to subtract the score column by X, use value of -X as EXTRAPARAM. 3) max, the EXTRAPARAM is required and will take the maximum value between score and the EXTRAPARAM. 4) min, the EXTRAPARAM is required and will take the minimum value between score and the EXTRAPARAM. 5) p2q, this will convert p-value scores to q-value scores using Benjamini-Hochberg process. The EXTRAPARAM is not required. This method assumes the scores are -log10 p-value from MACS2. Any other types of score will cause unexpected errors.", default="p2q")
478 argparser_bdgopt.add_argument( "-p", "--extra-param", dest = "extraparam", type = float, nargs = "*",
479 help = "The extra parameter for METHOD. Check the detail of -m option.")
480 add_outdir_option( argparser_bdgopt )
481 argparser_bdgopt.add_argument( "-o", "--ofile", dest = "ofile", type = str,
482 help = "Output BEDGraph filename.", required = True )
483 return
485 def add_cmbreps_parser( subparsers ):
486 """Add function 'combine replicates' argument parsers.
488 argparser_cmbreps = subparsers.add_parser( "cmbreps",
489 help = "Combine BEDGraphs of scores from replicates. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
490 argparser_cmbreps.add_argument( "-i", dest = "ifile", type = str, required = True, nargs = "+",
491 help = "MACS score in bedGraph for each replicate. Require at least 2 files such as '-i A B C D'. REQUIRED" )
492 # argparser_cmbreps.add_argument( "-w", dest = "weights", type = float, nargs = "*",
493 # help = "Weight for each replicate. Default is 1.0 for each. When given, require same number of parameters as IFILE." )
494 argparser_cmbreps.add_argument( "-m", "--method", dest = "method", type = str,
495 choices = ( "fisher", "max", "mean" ),
496 help = "Method to use while combining scores from replicates. 1) fisher: Fisher's combined probability test. It requires scores in ppois form (-log10 pvalues) from bdgcmp. Other types of scores for this method may cause cmbreps unexpected errors. 2) max: take the maximum value from replicates for each genomic position. 3) mean: take the average value. Note, except for Fisher's method, max or mean will take scores AS IS which means they won't convert scores from log scale to linear scale or vice versa.", default="fisher")
497 add_outdir_option( argparser_cmbreps )
498 argparser_cmbreps.add_argument( "-o", "--ofile", dest = "ofile", type = str, required = True,
499 help = "Output BEDGraph filename for combined scores." )
500 return
502 def add_randsample_parser( subparsers ):
503 argparser_randsample = subparsers.add_parser( "randsample",
504 help = "Randomly sample number/percentage of total reads." )
505 argparser_randsample.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
506 help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
508 p_or_n_group = argparser_randsample.add_mutually_exclusive_group( required = True )
509 p_or_n_group.add_argument( "-p", "--percentage", dest = "percentage", type = float,
510 help = "Percentage of tags you want to keep. Input 80.0 for 80%%. This option can't be used at the same time with -n/--num. REQUIRED")
511 p_or_n_group.add_argument( "-n", "--number", dest = "number", type = float,
512 help = "Number of tags you want to keep. Input 8000000 or 8e+6 for 8 million. This option can't be used at the same time with -p/--percent. Note that the number of tags in output is approximate as the number specified here. REQUIRED" )
513 argparser_randsample.add_argument( "--seed", dest = "seed", type = int, default = -1,
514 help = "Set the random seed while down sampling data. Must be a non-negative integer in order to be effective. DEFAULT: not set" )
515 argparser_randsample.add_argument( "-o", "--ofile", dest = "outputfile", type = str,
516 help = "Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. DEFAULT: stdout",
517 default = None)
518 add_outdir_option( argparser_randsample )
519 argparser_randsample.add_argument( "-s", "--tsize", dest = "tsize", type = int, default = None,
520 help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set")
521 argparser_randsample.add_argument( "-f", "--format", dest = "format", type = str,
522 choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"),
523 help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will %(prog)s decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"",
524 default = "AUTO" )
525 argparser_randsample.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
526 help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
528 argparser_randsample.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
529 help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
530 return
532 def add_bdgdiff_parser( subparsers ):
533 argparser_bdgdiff = subparsers.add_parser( "bdgdiff",
534 help = "Differential peak detection based on paired four bedgraph files. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
535 argparser_bdgdiff.add_argument( "--t1", dest = "t1bdg", type = str, required = True,
536 help = "MACS pileup bedGraph for condition 1. Incompatible with callpeak --SPMR output. REQUIRED" )
537 argparser_bdgdiff.add_argument( "--t2", dest="t2bdg", type = str, required = True,
538 help = "MACS pileup bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED" )
539 argparser_bdgdiff.add_argument( "--c1", dest = "c1bdg", type = str, required = True,
540 help = "MACS control lambda bedGraph for condition 1. Incompatible with callpeak --SPMR output. REQUIRED" )
541 argparser_bdgdiff.add_argument( "--c2", dest="c2bdg", type = str, required = True,
542 help = "MACS control lambda bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED" )
543 argparser_bdgdiff.add_argument( "-C", "--cutoff", dest = "cutoff", type = float,
544 help = "logLR cutoff. DEFAULT: 3 (likelihood ratio=1000)", default = 3 )
545 argparser_bdgdiff.add_argument( "-l", "--min-len", dest = "minlen", type = int,
546 help = "Minimum length of differential region. Try bigger value to remove small regions. DEFAULT: 200", default = 200 )
547 argparser_bdgdiff.add_argument( "-g", "--max-gap", dest = "maxgap", type = int,
548 help = "Maximum gap to merge nearby differential regions. Consider a wider gap for broad marks. Maximum gap should be smaller than minimum length (-g). DEFAULT: 100", default = 100 )
549 argparser_bdgdiff.add_argument( "--d1", "--depth1", dest = "depth1", type = float, default = 1.0,
550 help = "Sequencing depth (# of non-redundant reads in million) for condition 1. It will be used together with --d2. See description for --d2 below for how to assign them. Default: 1" )
551 argparser_bdgdiff.add_argument( "--d2", "--depth2", dest = "depth2", type = float, default = 1.0,
552 help = "Sequencing depth (# of non-redundant reads in million) for condition 2. It will be used together with --d1. DEPTH1 and DEPTH2 will be used to calculate scaling factor for each sample, to down-scale larger sample to the level of smaller one. For example, while comparing 10 million condition 1 and 20 million condition 2, use --d1 10 --d2 20, then pileup value in bedGraph for condition 2 will be divided by 2. Default: 1" )
554 add_outdir_option( argparser_bdgdiff )
555 output_group = argparser_bdgdiff.add_mutually_exclusive_group( required = True )
556 output_group.add_argument( "--o-prefix", dest = "oprefix", type = str,
557 help = "Output file prefix. Actual files will be named as PREFIX_cond1.bed, PREFIX_cond2.bed and PREFIX_common.bed. Mutually exclusive with -o/--ofile." )
558 output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str, nargs = 3,
559 help = "Output filenames. Must give three arguments in order: 1. file for unique regions in condition 1; 2. file for unique regions in condition 2; 3. file for common regions in both conditions. Note: mutually exclusive with --o-prefix." )
561 return
563 def add_refinepeak_parser( subparsers ):
564 argparser_refinepeak = subparsers.add_parser( "refinepeak",
565 help = "(Experimental) Take raw reads alignment, refine peak summits and give scores measuring balance of waston/crick tags. Inspired by SPP." )
566 argparser_refinepeak.add_argument( "-b", dest = "bedfile", type = str, required = True,
567 help = "Candidate peak file in BED format. REQUIRED." )
568 argparser_refinepeak.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
569 help = "ChIP-seq alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
570 argparser_refinepeak.add_argument( "-f", "--format", dest = "format", type = str,
571 choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE"),
572 help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"",
573 default = "AUTO" )
574 argparser_refinepeak.add_argument( "-c", "--cutoff" , dest = "cutoff", type = float,
575 help = "Cutoff DEFAULT: 5", default = 5 )
576 argparser_refinepeak.add_argument( "-w", "--window-size", dest= "windowsize", help = 'Scan window size on both side of the summit (default: 100bp)',
577 type = int, default = 200)
578 argparser_refinepeak.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
579 help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
581 argparser_refinepeak.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
582 help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
584 add_outdir_option( argparser_refinepeak )
585 add_output_group( argparser_refinepeak )
587 return
589 def add_predictd_parser( subparsers ):
590 """Add main function 'predictd' argument parsers.
592 argparser_predictd = subparsers.add_parser("predictd", help="Predict d or fragment size from alignment results. *Will NOT filter duplicates*")
594 # group for input files
595 argparser_predictd.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
596 help = "ChIP-seq alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
597 argparser_predictd.add_argument( "-f", "--format", dest = "format", type = str,
598 choices = ("AUTO", "BAM", "SAM", "BED", "ELAND",
599 "ELANDMULTI", "ELANDEXPORT", "BOWTIE",
600 "BAMPE", "BEDPE"),
601 help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"",
602 default = "AUTO" )
603 argparser_predictd.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
604 help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default:hs" )
605 argparser_predictd.add_argument( "-s", "--tsize", dest = "tsize", type = int, default = None,
606 help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set")
607 argparser_predictd.add_argument( "--bw", dest = "bw", type = int, default = 300,
608 help = "Band width for picking regions to compute fragment size. This value is only used while building the shifting model. DEFAULT: 300")
609 argparser_predictd.add_argument( "--d-min", dest = "d_min", type = int, default = 20,
610 help = "Minimum fragment size in basepair. Any predicted fragment size less than this will be excluded. DEFAULT: 20")
611 argparser_predictd.add_argument( "-m", "--mfold", dest = "mfold", type = int, default = [5,50], nargs = 2,
612 help = "Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. Fold-enrichment in regions must be lower than upper limit, and higher than the lower limit. Use as \"-m 10 30\". DEFAULT:5 50" )
614 add_outdir_option( argparser_predictd )
615 argparser_predictd.add_argument( "--rfile", dest = "rfile", type = str, default = "predictd",
616 help = "PREFIX of filename of R script for drawing X-correlation figure. DEFAULT:'predictd' and R file will be predicted_model.R" )
617 argparser_predictd.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
618 help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
620 argparser_predictd.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
621 help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )
623 return
625 def add_pileup_parser( subparsers ):
626 argparser_pileup = subparsers.add_parser( "pileup",
627 help = "Pileup aligned reads with a given extension size (fragment size or d in MACS language). Note there will be no step for duplicate reads filtering or sequencing depth scaling, so you may need to do certain pre/post-processing." )
628 argparser_pileup.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
629 help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
630 argparser_pileup.add_argument( "-o", "--ofile", dest = "outputfile", type = str, required = True,
631 help = "Output bedGraph file name. If not specified, will write to standard output. REQUIRED." )
632 add_outdir_option( argparser_pileup )
633 argparser_pileup.add_argument( "-f", "--format", dest = "format", type = str,
634 choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"),
635 help = "Format of tag file, \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\", \"BOWTIE\", \"BAMPE\", or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. DEFAULT: \"AUTO\", MACS2 will pick a format from \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\" and \"BOWTIE\". If the format is BAMPE or BEDPE, please specify it explicitly. Please note that when the format is BAMPE or BEDPE, the -B and --extsize options would be ignored.",
636 default = "AUTO" )
637 argparser_pileup.add_argument( "-B", "--both-direction", dest = "bothdirection", action = "store_true", default = False,
638 help = "By default, any read will be extended towards downstream direction by extension size. So it's [0,size-1] (1-based index system) for plus strand read and [-size+1,0] for minus strand read where position 0 is 5' end of the aligned read. Default behavior can simulate MACS2 way of piling up ChIP sample reads where extension size is set as fragment size/d. If this option is set as on, aligned reads will be extended in both upstream and downstream directions by extension size. It means [-size,size] where 0 is the 5' end of a aligned read. It can partially simulate MACS2 way of piling up control reads. However MACS2 local bias is calculated by maximizing the expected pileup over a ChIP fragment size/d estimated from 10kb, 1kb, d and whole genome background. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: False" )
640 argparser_pileup.add_argument( "--extsize", dest = "extsize", type = int, default = 200,
641 help = "The extension size in bps. Each alignment read will become a EXTSIZE of fragment, then be piled up. Check description for -B for detail. It's twice the `shiftsize` in old MACSv1 language. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: 200 " )
642 argparser_pileup.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
643 help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
645 argparser_pileup.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
646 help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
647 return
650 if __name__ == '__main__':
651 try:
652 main()
653 except KeyboardInterrupt:
654 sys.stderr.write("User interrupted me! ;-) Bye!\n")
655 sys.exit(0)