Merge pull request #378 from taoliu/fix_setup_script_364
[MACS.git] / bin / macs2
blob86b81d9be4899c6f3d5137f452fc2b61ac4ab326
1 #!/usr/bin/env python
2 # Time-stamp: <2019-09-25 10:24:38 taoliu>
4 """Description: MACS v2 main executable.
6 This code is free software; you can redistribute it and/or modify it
7 under the terms of the BSD License (see the file LICENSE included with
8 the distribution).
9 """
11 # ------------------------------------
12 # python modules
13 # ------------------------------------
15 import os
16 import sys
17 import argparse as ap
18 import tempfile
20 # ------------------------------------
21 # own python modules
22 # ------------------------------------
23 from MACS2.Constants import *
25 # ------------------------------------
26 # Main function
27 # ------------------------------------
28 def main():
29 """The Main function/pipeline for MACS.
31 """
32 # Parse options...
33 argparser = prepare_argparser()
34 args = argparser.parse_args()
36 subcommand = args.subcommand
38 if args.outdir:
39 # use a output directory to store MACS output
40 if not os.path.exists( args.outdir ):
41 try:
42 os.makedirs( args.outdir )
43 except:
44 sys.exit( "Output directory (%s) could not be created. Terminating program." % args.outdir )
47 if subcommand == "callpeak":
48 # General call peak
49 from MACS2.callpeak_cmd import run
50 try:
51 run( args )
52 except MemoryError:
53 sys.exit( "MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
54 #elif subcommand == "diffpeak":
55 # # differential peak calling w/ bedgraphs + optional peak regions
56 # from MACS2.diffpeak_cmd import run
57 # run( args )
58 elif subcommand == "bdgpeakcall":
59 # call peak from bedGraph
60 from MACS2.bdgpeakcall_cmd import run
61 run( args )
62 elif subcommand == "bdgbroadcall":
63 # call broad peak from bedGraph
64 from MACS2.bdgbroadcall_cmd import run
65 run( args )
66 elif subcommand == "bdgcmp":
67 # compare treatment and control to make enrichment scores
68 from MACS2.bdgcmp_cmd import run
69 run( args )
70 elif subcommand == "bdgopt":
71 # operations on the score column of bedGraph file
72 from MACS2.bdgopt_cmd import run
73 run( args )
74 elif subcommand == "cmbreps":
75 # combine replicates
76 from MACS2.cmbreps_cmd import run
77 run( args )
78 elif subcommand == "randsample":
79 # randomly sample sequencing reads, and save as bed file
80 from MACS2.randsample_cmd import run
81 try:
82 run( args )
83 except MemoryError:
84 sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
85 elif subcommand == "filterdup":
86 # filter out duplicate reads, and save as bed file
87 from MACS2.filterdup_cmd import run
88 try:
89 run( args )
90 except MemoryError:
91 sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
92 elif subcommand == "bdgdiff":
93 # differential calling
94 from MACS2.bdgdiff_cmd import run
95 run( args )
96 elif subcommand == "refinepeak":
97 # refine peak summits
98 from MACS2.refinepeak_cmd import run
99 try:
100 run( args )
101 except MemoryError:
102 sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
103 elif subcommand == "predictd":
104 # predict d or fragment size
105 from MACS2.predictd_cmd import run
106 try:
107 run( args )
108 except MemoryError:
109 sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
110 elif subcommand == "pileup":
111 # pileup alignment results with a given extension method
112 from MACS2.pileup_cmd import run
113 try:
114 run( args )
115 except MemoryError:
116 sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
118 def prepare_argparser ():
119 """Prepare optparser object. New options will be added in this
120 function first.
123 description = "%(prog)s -- Model-based Analysis for ChIP-Sequencing"
124 epilog = "For command line options of each command, type: %(prog)s COMMAND -h"
125 #Check community site: http://groups.google.com/group/macs-announcement/
126 #Source code: https://github.com/taoliu/MACS/"
127 # top-level parser
128 argparser = ap.ArgumentParser( description = description, epilog = epilog ) #, usage = usage )
129 argparser.add_argument("--version", action="version", version="%(prog)s "+MACS_VERSION)
130 subparsers = argparser.add_subparsers( dest = 'subcommand' ) #help="sub-command help")
131 subparsers.required = True
133 # command for 'callpeak'
134 add_callpeak_parser( subparsers )
136 # # command for 'diffpeak'
137 # add_diffpeak_parser( subparsers )
139 # command for 'bdgpeakcall'
140 add_bdgpeakcall_parser( subparsers )
142 # command for 'bdgbroadcall'
143 add_bdgbroadcall_parser( subparsers )
145 # command for 'bdgcmp'
146 add_bdgcmp_parser( subparsers )
148 # command for 'bdgopt'
149 add_bdgopt_parser( subparsers )
151 # command for 'cmbreps'
152 add_cmbreps_parser( subparsers )
154 # command for 'bdgdiff'
155 add_bdgdiff_parser( subparsers )
157 # command for 'filterdup'
158 add_filterdup_parser( subparsers )
160 # command for 'predictd'
161 add_predictd_parser( subparsers )
163 # command for 'pileup'
164 add_pileup_parser( subparsers )
166 # command for 'randsample'
167 add_randsample_parser( subparsers )
169 # command for 'refinepeak'
170 add_refinepeak_parser( subparsers )
172 return argparser
174 def add_outdir_option ( parser ):
175 parser.add_argument("--outdir", dest = "outdir", type = str, default = '',
176 help = "If specified all output files will be written to that directory. Default: the current working directory")
178 def add_output_group ( parser, required = True ):
179 output_group = parser.add_mutually_exclusive_group( required = required )
180 output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str,
181 help = "Output file name. Mutually exclusive with --o-prefix." )
182 output_group.add_argument( "--o-prefix", dest = "oprefix", type = str,
183 help = "Output file prefix. Mutually exclusive with -o/--ofile." )
185 def add_callpeak_parser( subparsers ):
186 """Add main function 'peak calling' argument parsers.
188 argparser_callpeak = subparsers.add_parser("callpeak", help="Main MACS2 Function: Call peaks from alignment results.")
190 # group for input files
191 group_input = argparser_callpeak.add_argument_group( "Input files arguments" )
192 group_input.add_argument( "-t", "--treatment", dest = "tfile", type = str, required = True, nargs = "+",
193 help = "ChIP-seq treatment file. If multiple files are given as '-t A B C', then they will all be read and pooled together. REQUIRED." )
194 group_input.add_argument( "-c", "--control", dest = "cfile", type = str, nargs = "*",
195 help = "Control file. If multiple files are given as '-c A B C', they will be pooled to estimate ChIP-seq background noise.")
196 group_input.add_argument( "-f", "--format", dest = "format", type = str,
197 choices = ("AUTO", "BAM", "SAM", "BED", "ELAND",
198 "ELANDMULTI", "ELANDEXPORT", "BOWTIE",
199 "BAMPE", "BEDPE"),
200 help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format (except for BAMPE and BEDPE which should be implicitly set) the file is. Please check the definition in README. Please note that if the format is set as BAMPE or BEDPE, MACS2 will call its special Paired-end mode to call peaks by piling up the actual ChIPed fragments defined by both aligned ends, instead of predicting the fragment size first and extending reads. Also please note that the BEDPE only contains three columns, and is NOT the same BEDPE format used by BEDTOOLS. DEFAULT: \"AUTO\"",
201 default = "AUTO" )
202 group_input.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
203 help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default:hs" )
204 group_input.add_argument( "-s", "--tsize", dest = "tsize", type = int, default = None,
205 help = "Tag size/read length. This will override the auto detected tag size. DEFAULT: Not set")
206 group_input.add_argument( "--keep-dup", dest = "keepduplicates", type = str, default = "1",
207 help = "It controls the behavior towards duplicate tags at the exact same location -- the same coordination and the same strand. The 'auto' option makes MACS calculate the maximum tags at the exact same location based on binomal distribution using 1e-5 as pvalue cutoff; and the 'all' option keeps every tags. If an integer is given, at most this number of tags will be kept at the same location. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS2 will still read them although the reads may be decided by MACS2 as duplicate later. If you plan to rely on samtools/picard/any other tool to filter duplicates, please remove those duplicate reads and save a new alignment file then ask MACS2 to keep all by '--keep-dup all'. The default is to keep one tag at the same location. Default: 1" )
209 # group for output files
210 group_output = argparser_callpeak.add_argument_group( "Output arguments" )
211 add_outdir_option( group_output )
212 group_output.add_argument( "-n", "--name", dest = "name", type = str,
213 help = "Experiment name, which will be used to generate output file names. DEFAULT: \"NA\"",
214 default = "NA" )
215 group_output.add_argument( "-B", "--bdg", dest = "store_bdg", action = "store_true",
216 help = "Whether or not to save extended fragment pileup, and local lambda tracks (two files) at every bp into a bedGraph file. DEFAULT: False",
217 default = False )
218 group_output.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
219 help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )
220 group_output.add_argument( "--trackline", dest="trackline", action="store_true", default = False,
221 help = "Tells MACS to include trackline with bedGraph files. To include this trackline while displaying bedGraph at UCSC genome browser, can show name and description of the file as well. However my suggestion is to convert bedGraph to bigWig, then show the smaller and faster binary bigWig file at UCSC genome browser, as well as downstream analysis. Require -B to be set. Default: Not include trackline." )
223 group_output.add_argument( "--SPMR", dest = "do_SPMR", action = "store_true", default = False,
224 help = "If True, MACS will SAVE signal per million reads for fragment pileup profiles. It won't interfere with computing pvalue/qvalue during peak calling, since internally MACS2 keeps using the raw pileup and scaling factors between larger and smaller dataset to calculate statistics measurements. If you plan to use the signal output in bedGraph to call peaks using bdgcmp and bdgpeakcall, you shouldn't use this option because you will end up with different results. However, this option is recommended for displaying normalized pileup tracks across many datasets. Require -B to be set. Default: False" )
225 # group for bimodal
226 group_bimodal = argparser_callpeak.add_argument_group( "Shifting model arguments" )
227 group_bimodal.add_argument( "--nomodel", dest = "nomodel", action = "store_true", default = False,
228 help = "Whether or not to build the shifting model. If True, MACS will not build model. by default it means shifting size = 100, try to set extsize to change it. It's highly recommended that while you have many datasets to process and you plan to compare different conditions, aka differential calling, use both 'nomodel' and 'extsize' to make signal files from different datasets comparable. DEFAULT: False" )
229 group_bimodal.add_argument( "--shift", dest = "shift", type = int, default = 0,
230 help = "(NOT the legacy --shiftsize option!) The arbitrary shift in bp. Use discretion while setting it other than default value. When NOMODEL is set, MACS will use this value to move cutting ends (5') towards 5'->3' direction then apply EXTSIZE to extend them to fragments. When this value is negative, ends will be moved toward 3'->5' direction. Recommended to keep it as default 0 for ChIP-Seq datasets, or -1 * half of EXTSIZE together with EXTSIZE option for detecting enriched cutting loci such as certain DNAseI-Seq datasets. Note, you can't set values other than 0 if format is BAMPE or BEDPE for paired-end data. DEFAULT: 0. " )
231 group_bimodal.add_argument( "--extsize", dest = "extsize", type = int, default = 200,
232 help = "The arbitrary extension size in bp. When nomodel is true, MACS will use this value as fragment size to extend each read towards 3' end, then pile them up. It's exactly twice the number of obsolete SHIFTSIZE. In previous language, each read is moved 5'->3' direction to middle of fragment by 1/2 d, then extended to both direction with 1/2 d. This is equivalent to say each read is extended towards 5'->3' into a d size fragment. DEFAULT: 200. EXTSIZE and SHIFT can be combined when necessary. Check SHIFT option." )
233 group_bimodal.add_argument( "--bw", dest = "bw", type = int, default = 300,
234 help = "Band width for picking regions to compute fragment size. This value is only used while building the shifting model. Tweaking this is not recommended. DEFAULT: 300")
235 group_bimodal.add_argument( "--d-min", dest = "d_min", type = int, default = 20,
236 help = "Minimum fragment size in basepair. Any predicted fragment size less than this will be excluded. DEFAULT: 20")
237 group_bimodal.add_argument( "-m", "--mfold", dest = "mfold", type = int, default = [5,50], nargs = 2,
238 help = "Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. Fold-enrichment in regions must be lower than upper limit, and higher than the lower limit. Use as \"-m 10 30\". This setting is only used while building the shifting model. Tweaking it is not recommended. DEFAULT:5 50" )
240 group_bimodal.add_argument( "--fix-bimodal", dest = "onauto", action = "store_true",
241 help = "Whether turn on the auto pair model process. If set, when MACS failed to build paired model, it will use the nomodel settings, the --exsize parameter to extend each tags towards 3' direction. Not to use this automate fixation is a default behavior now. DEFAULT: False",
242 default = False )
244 # General options.
245 group_callpeak = argparser_callpeak.add_argument_group( "Peak calling arguments" )
246 p_or_q_group = group_callpeak.add_mutually_exclusive_group()
247 p_or_q_group.add_argument( "-q", "--qvalue", dest = "qvalue", type = float, default = 0.05,
248 help = "Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.05. -q, and -p are mutually exclusive." )
249 p_or_q_group.add_argument( "-p", "--pvalue", dest = "pvalue", type = float,
250 help = "Pvalue cutoff for peak detection. DEFAULT: not set. -q, and -p are mutually exclusive. If pvalue cutoff is set, qvalue will not be calculated and reported as -1 in the final .xls file." )
252 # about scaling
253 group_callpeak.add_argument( "--scale-to", dest = "scaleto", type = str, choices = ("large", "small"),
254 help = "When set to 'small', scale the larger sample up to the smaller sample. When set to 'larger', scale the smaller sample up to the bigger sample. By default, scale to 'small'. This option replaces the obsolete '--to-large' option. The default behavior is recommended since it will lead to less significant p/q-values in general but more specific results. Keep in mind that scaling down will influence control/input sample more. DEFAULT: 'small', the choice is either 'small' or 'large'." )
255 group_callpeak.add_argument( "--down-sample", dest = "downsample", action = "store_true", default = False,
256 help = "When set, random sampling method will scale down the bigger sample. By default, MACS uses linear scaling. Warning: This option will make your result unstable and irreproducible since each time, random reads would be selected. Consider to use 'randsample' script instead. <not implmented>If used together with --SPMR, 1 million unique reads will be randomly picked.</not implemented> Caution: due to the implementation, the final number of selected reads may not be as you expected! DEFAULT: False" )
257 group_callpeak.add_argument( "--seed", dest = "seed", type = int, default = -1,
258 help = "Set the random seed while down sampling data. Must be a non-negative integer in order to be effective. DEFAULT: not set" )
259 group_callpeak.add_argument( "--tempdir", dest="tempdir", default=tempfile.gettempdir(),
260 help = "Optional directory to store temp files. DEFAULT: %(default)s")
261 group_callpeak.add_argument( "--nolambda", dest = "nolambda", action = "store_true",
262 help = "If True, MACS will use fixed background lambda as local lambda for every peak region. Normally, MACS calculates a dynamic local lambda to reflect the local bias due to the potential chromatin accessibility. ",
263 default = False )
264 group_callpeak.add_argument( "--slocal", dest = "smalllocal", type = int, default = 1000,
265 help = "The small nearby region in basepairs to calculate dynamic lambda. This is used to capture the bias near the peak summit region. Invalid if there is no control data. If you set this to 0, MACS will skip slocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation while the control data is available. The final local bias would be the maximum of the lambda value from d, slocal, and llocal size windows. While control is not available, d and slocal lambda won't be considered. DEFAULT: 1000 " )
266 group_callpeak.add_argument( "--llocal", dest = "largelocal", type = int, default = 10000,
267 help = "The large nearby region in basepairs to calculate dynamic lambda. This is used to capture the surround bias. If you set this to 0, MACS will skip llocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation while the control data is available. The final local bias would be the maximum of the lambda value from d, slocal, and llocal size windows. While control is not available, d and slocal lambda won't be considered. DEFAULT: 10000." )
268 group_callpeak.add_argument( "--max-gap", dest = "maxgap", type = int,
269 help = "Maximum gap between significant sites to cluster them together. The DEFAULT value is the detected read length/tag size." )
270 group_callpeak.add_argument( "--min-length", dest = "minlen", type = int,
271 help = "Minimum length of a peak. The DEFAULT value is the predicted fragment size d. Note, if you set a value smaller than the fragment size, it may have NO effect on the result. For BROAD peak calling, try to set a large value such as 500bps. You can also use '--cutoff-analysis' option with default setting, and check the column 'avelpeak' under different cutoff values to decide a reasonable minlen value." )
272 group_callpeak.add_argument( "--broad", dest = "broad", action = "store_true",
273 help = "If set, MACS will try to call broad peaks using the --broad-cutoff setting. Please tweak '--broad-cutoff' setting to control the peak calling behavior. At the meantime, either -q or -p cutoff will be used to define regions with 'stronger enrichment' inside of broad peaks. The maximum gap is expanded to 4 * MAXGAP (--max-gap parameter). As a result, MACS will output a 'gappedPeak' and a 'broadPeak' file instead of 'narrowPeak' file. Note, a broad peak will be reported even if there is no 'stronger enrichment' inside. DEFAULT: False", default = False )
274 group_callpeak.add_argument( "--broad-cutoff", dest = "broadcutoff", type = float, default = 0.1,
275 help = "Cutoff for broad region. This option is not available unless --broad is set. If -p is set, this is a pvalue cutoff, otherwise, it's a qvalue cutoff. Please note that in broad peakcalling mode, MACS2 uses this setting to control the overall peak calling behavior, then uses -q or -p setting to define regions inside broad region as 'stronger' enrichment. DEFAULT: 0.1 " )
276 group_callpeak.add_argument( "--cutoff-analysis", dest="cutoff_analysis", action="store_true",
277 help = "While set, MACS2 will analyze number or total length of peaks that can be called by different p-value cutoff then output a summary table to help user decide a better cutoff. The table will be saved in NAME_cutoff_analysis.txt file. Note, minlen and maxgap may affect the results. WARNING: May take ~30 folds longer time to finish. The result can be useful for users to decide a reasonable cutoff value. DEFAULT: False", default = False )
279 # post-processing options
280 group_postprocessing = argparser_callpeak.add_argument_group( "Post-processing options" )
281 postprocess_group = group_postprocessing.add_mutually_exclusive_group()
283 postprocess_group.add_argument( "--call-summits", dest="call_summits", action="store_true",
284 help="If set, MACS will use a more sophisticated signal processing approach to find subpeak summits in each enriched peak region. DEFAULT: False",default=False)
285 # postprocess_group.add_argument( "--refine-peaks", dest="refine_peaks", action="store_true",
286 # help="If set, MACS will refine peak summits by measuring balance of waston/crick tags. Those peaks without balancing tags will be disgarded. Peak summits will be redefined and reassgined with scores. Note, duplicate reads will be put back while calculating read balance. And more memory will be used. Default: False", default=False )
287 group_postprocessing.add_argument( "--fe-cutoff", dest="fecutoff", type=float, default = 1.0,
288 help = "When set, the value will be used to filter out peaks with low fold-enrichment. Note, MACS2 use 1.0 as pseudocount while calculating fold-enrichment. DEFAULT: 1.0")
290 # other options
291 group_other = argparser_callpeak.add_argument_group( "Other options" )
292 group_other.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
293 help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
295 # obsolete options
296 group_obsolete = argparser_callpeak.add_argument_group( "Obsolete options" )
297 group_obsolete.add_argument( "--to-large", dest = "tolarge", action = "store_true", default = False,
298 help = "Obsolete option. Please use '--scale-to large' instead." )
299 group_obsolete.add_argument( "--ratio", dest = "ratio", type = float, default = 1.0,
300 help = "Obsolete option. Originally designed to normalize treatment and control with customized ratio, now it won't have any effect." )
301 return
303 def add_diffpeak_parser( subparsers ):
304 """Add main function 'peak calling' argument parsers.
306 argparser_diffpeak = subparsers.add_parser("diffpeak", help="MACS2 Differential Peak Function: Call peaks from bedgraphs (or use optional peak regions) and determine peaks of differential occupancy")
309 # group for input files
310 group_input = argparser_diffpeak.add_argument_group( "Input files arguments" )
311 group_input.add_argument( "--t1", dest = "t1bdg", type = str, required = True,
312 help = "MACS pileup bedGraph for condition 1. REQUIRED" )
313 group_input.add_argument( "--t2", dest="t2bdg", type = str, required = True,
314 help = "MACS pileup bedGraph for condition 2. REQUIRED" )
315 group_input.add_argument( "--c1", dest = "c1bdg", type = str, required = True,
316 help = "MACS control lambda bedGraph for condition 1. REQUIRED" )
317 group_input.add_argument( "--c2", dest="c2bdg", type = str, required = True,
318 help = "MACS control lambda bedGraph for condition 2. REQUIRED" )
319 group_input.add_argument( "--peaks1", dest = "peaks1", type = str, default='',
320 help = "MACS peaks.xls file for condition 1. Optional but must specify peaks2 if present" )
321 group_input.add_argument( "--peaks2", dest="peaks2", type = str, default='',
322 help = "MACS peaks.xls file for condition 2. Optional but must specify peaks1 if present" )
323 group_input.add_argument( "-d", "--depth-multiplier", dest = "depth", type = float, default = [1.0], nargs = "+",
324 help = "Sequence depth in million reads. If two depths are different, use '-d X -d Y' for X million reads in condition 1 and Y million reads in condition 2. If they are same, use '-d X' for X million reads in both condition 1 and condition 2 (e.g. the bedGraph files are from 'callpeak --SPMR'). Default: 1 (if you use 'macs2 callpeak --SPMR' to generate bdg files, we recommend using the smaller depth as a multiplier)" )
325 # group_input.add_argument( "-f", "--format", dest = "format", type = str,
326 # choices = ("AUTO", "BED", "XLS"),
327 # help = "Format of peak regions file, \"AUTO\", \"BED\" or \"XLS\". The default AUTO option will let MACS decide which format the file is based on the file extension. DEFAULT: \"AUTO\"",
328 # default = "AUTO" )
330 # group for output files
331 group_output = argparser_diffpeak.add_argument_group( "Output arguments" )
332 add_outdir_option( group_output )
333 group_output.add_argument( "-n", "--name", dest = "name", type = str,
334 help = "Experiment name, which will be used to generate output file names. DEFAULT: \"diffpeak\"",
335 default = "diffpeak" )
336 group_output.add_argument( "-B", "--bdg", dest = "store_bdg", action = "store_true",
337 help = "Whether or not to save basewise p/qvalues from every peak region into a bedGraph file. DEFAULT: False",
338 default = False )
339 group_output.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
340 help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )
341 group_output.add_argument( "--trackline", dest="trackline", action="store_true", default = False,
342 help = "Tells MACS to include trackline with bedGraph files. To include this trackline while displaying bedGraph at UCSC genome browser, can show name and description of the file as well. However my suggestion is to convert bedGraph to bigWig, then show the smaller and faster binary bigWig file at UCSC genome browser, as well as downstream analysis. Require -B to be set. Default: Not include trackline." )
344 # General options.
345 group_diffpeak = argparser_diffpeak.add_argument_group( "Peak calling arguments" )
346 p_or_q_group = group_diffpeak.add_mutually_exclusive_group()
347 p_or_q_group.add_argument( "-q", "--qvalue", dest = "diff_qvalue", type = float, default = 0.05,
348 help = "Minimum FDR (q-value) cutoff for differences. DEFAULT: 0.05. -q and -p are mutually exclusive." )
349 p_or_q_group.add_argument( "-p", "--pvalue", dest = "diff_pvalue", type = float,
350 help = "Pvalue cutoff for differences. DEFAULT: not set. -q and -p are mutually exclusive." )
351 p_or_q_group2 = group_diffpeak.add_mutually_exclusive_group()
352 p_or_q_group2.add_argument( "--peaks-qvalue", dest = "peaks_qvalue", type = float, default = 0.05,
353 help = "Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.05. --peaks-qvalue and --peaks-pvalue are mutually exclusive." )
354 p_or_q_group2.add_argument( "--peaks-pvalue", dest = "peaks_pvalue", type = float,
355 help = "Pvalue cutoff for peak detection. DEFAULT: not set. --peaks-qvalue and --peaks-pvalue are mutually exclusive." )
356 group_diffpeak.add_argument( "-m", "--peak-min-len", dest = "pminlen", type = int,
357 help = "Minimum length of peak regions. DEFAULT: 200", default = 200 )
358 group_diffpeak.add_argument( "--diff-min-len", dest = "dminlen", type = int,
359 help = "Minimum length of differential region (must overlap a valid peak). DEFAULT: 50", default = 100 )
360 group_diffpeak.add_argument( "--ignore-duplicate-peaks", dest="ignore_duplicate_peaks", action="store_false",
361 help="If set, MACS will ignore duplicate regions with identical coordinates. Helpful if --call-summits was set. DEFAULT: True",default=True)
362 return
364 def add_filterdup_parser( subparsers ):
365 argparser_filterdup = subparsers.add_parser( "filterdup",
366 help = "Remove duplicate reads at the same position, then save the rest alignments to BED or BEDPE file. If you use '--keep-dup all option', this script can be utilized to convert any acceptable format into BED or BEDPE format." )
367 argparser_filterdup.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
368 help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
369 argparser_filterdup.add_argument( "-f", "--format", dest = "format", type = str,
370 choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"),
371 help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"",
372 default = "AUTO" )
373 argparser_filterdup.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
374 help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), DEFAULT:hs" )
375 argparser_filterdup.add_argument( "-s", "--tsize", dest = "tsize", type = int,
376 help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set" )
377 argparser_filterdup.add_argument( "-p", "--pvalue", dest = "pvalue", type = float,
378 help = "Pvalue cutoff for binomial distribution test. DEFAULT:1e-5" )
379 argparser_filterdup.add_argument( "--keep-dup", dest = "keepduplicates", type = str, default = "auto",
380 help = "It controls the '%(prog)s' behavior towards duplicate tags/pairs at the exact same location -- the same coordination and the same strand. The 'auto' option makes '%(prog)s' calculate the maximum tags at the exact same location based on binomal distribution using given -p as pvalue cutoff; and the 'all' option keeps every tags (useful if you only want to convert formats). If an integer is given, at most this number of tags will be kept at the same location. Note, MACS2 callpeak function uses KEEPDUPLICATES=1 as default. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS2 will still read them although the reads may be decided by MACS2 as duplicate later. Default: auto" )
381 argparser_filterdup.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
382 help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
384 argparser_filterdup.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
385 help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
386 add_outdir_option( argparser_filterdup )
387 argparser_filterdup.add_argument( "-o", "--ofile", dest = "outputfile", type = str,
388 help = "Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. DEFAULT: stdout",
389 default = "stdout" )
390 argparser_filterdup.add_argument( "-d", "--dry-run", dest="dryrun", action="store_true", default=False,
391 help = "When set, filterdup will only output numbers instead of writing output files, including maximum allowable duplicates, total number of reads before filtering, total number of reads after filtering, and redundant rate. Default: not set" )
392 return
394 def add_bdgpeakcall_parser( subparsers ):
395 """Add function 'peak calling on bedGraph' argument parsers.
396 """
397 argparser_bdgpeakcall = subparsers.add_parser( "bdgpeakcall",
398 help = "Call peaks from bedGraph output. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
399 argparser_bdgpeakcall.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True,
400 help = "MACS score in bedGraph. REQUIRED" )
401 argparser_bdgpeakcall.add_argument( "-c", "--cutoff" , dest = "cutoff", type = float,
402 help = "Cutoff depends on which method you used for score track. If the file contains pvalue scores from MACS2, score 5 means pvalue 1e-5. DEFAULT: 5", default = 5 )
403 argparser_bdgpeakcall.add_argument( "-l", "--min-length", dest = "minlen", type = int,
404 help = "minimum length of peak, better to set it as d value. DEFAULT: 200", default = 200 )
405 argparser_bdgpeakcall.add_argument( "-g", "--max-gap", dest = "maxgap", type = int,
406 help = "maximum gap between significant points in a peak, better to set it as tag size. DEFAULT: 30", default = 30 )
407 argparser_bdgpeakcall.add_argument( "--call-summits", dest="call_summits", action="store_true", help=ap.SUPPRESS, default=False)
408 # help="If set, MACS will use a more sophisticated approach to find all summits in each enriched peak region. DEFAULT: False",default=False)
409 argparser_bdgpeakcall.add_argument( "--cutoff-analysis", dest="cutoff_analysis", action="store_true",
410 help = "While set, bdgpeakcall will analyze number or total length of peaks that can be called by different cutoff then output a summary table to help user decide a better cutoff. Note, minlen and maxgap may affect the results. DEFAULT: False", default = False )
411 argparser_bdgpeakcall.add_argument("--no-trackline", dest="trackline", action="store_false", default=True,
412 help="Tells MACS not to include trackline with bedGraph files. The trackline is required by UCSC.")
414 add_outdir_option( argparser_bdgpeakcall )
415 add_output_group( argparser_bdgpeakcall )
417 return
419 def add_bdgbroadcall_parser( subparsers ):
420 """Add function 'broad peak calling on bedGraph' argument parsers.
422 argparser_bdgbroadcall = subparsers.add_parser( "bdgbroadcall",
423 help = "Call broad peaks from bedGraph output. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
424 argparser_bdgbroadcall.add_argument( "-i", "--ifile", dest = "ifile" , type = str, required = True,
425 help = "MACS score in bedGraph. REQUIRED" )
426 argparser_bdgbroadcall.add_argument( "-c", "--cutoff-peak", dest = "cutoffpeak", type = float,
427 help = "Cutoff for peaks depending on which method you used for score track. If the file contains qvalue scores from MACS2, score 2 means qvalue 0.01. DEFAULT: 2",
428 default = 2 )
429 argparser_bdgbroadcall.add_argument( "-C", "--cutoff-link", dest = "cutofflink", type = float,
430 help = "Cutoff for linking regions/low abundance regions depending on which method you used for score track. If the file contains qvalue scores from MACS2, score 1 means qvalue 0.1, and score 0.3 means qvalue 0.5. DEFAULT: 1", default = 1 )
431 argparser_bdgbroadcall.add_argument( "-l", "--min-length", dest = "minlen", type = int,
432 help = "minimum length of peak, better to set it as d value. DEFAULT: 200", default = 200 )
433 argparser_bdgbroadcall.add_argument( "-g", "--lvl1-max-gap", dest = "lvl1maxgap", type = int,
434 help = "maximum gap between significant peaks, better to set it as tag size. DEFAULT: 30", default = 30 )
435 argparser_bdgbroadcall.add_argument( "-G", "--lvl2-max-gap", dest = "lvl2maxgap", type = int,
436 help = "maximum linking between significant peaks, better to set it as 4 times of d value. DEFAULT: 800", default = 800)
437 argparser_bdgbroadcall.add_argument("--no-trackline", dest="trackline", action="store_false", default=True,
438 help="Tells MACS not to include trackline with bedGraph files. The trackline is required by UCSC.")
439 add_outdir_option( argparser_bdgbroadcall )
440 add_output_group( argparser_bdgbroadcall )
441 return
443 def add_bdgcmp_parser( subparsers ):
444 """Add function 'peak calling on bedGraph' argument parsers.
446 argparser_bdgcmp = subparsers.add_parser( "bdgcmp",
447 help = "Deduct noise by comparing two signal tracks in bedGraph. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
448 argparser_bdgcmp.add_argument( "-t", "--tfile", dest = "tfile", type = str, required = True,
449 help = "Treatment bedGraph file, e.g. *_treat_pileup.bdg from MACSv2. REQUIRED")
450 argparser_bdgcmp.add_argument( "-c", "--cfile", dest = "cfile", type = str, required = True,
451 help = "Control bedGraph file, e.g. *_control_lambda.bdg from MACSv2. REQUIRED")
452 argparser_bdgcmp.add_argument( "-S", "--scaling-factor", dest = "sfactor", type = float, default = 1.0,
453 help = "Scaling factor for treatment and control track. Keep it as 1.0 or default in most cases. Set it ONLY while you have SPMR output from MACS2 callpeak, and plan to calculate scores as MACS2 callpeak module. If you want to simulate 'callpeak' w/o '--to-large', calculate effective smaller sample size after filtering redudant reads in million (e.g., put 31.415926 if effective reads are 31,415,926) and input it for '-S'; for 'callpeak --to-large', calculate effective reads in larger sample. DEFAULT: 1.0")
454 argparser_bdgcmp.add_argument( "-p", "--pseudocount", dest = "pseudocount", type = float, default = 0.0,
455 help = "The pseudocount used for calculating logLR, logFE or FE. The count will be applied after normalization of sequencing depth. DEFAULT: 0.0, no pseudocount is applied.")
457 argparser_bdgcmp.add_argument( "-m", "--method", dest = "method", type = str, nargs = "+",
458 choices = ( "ppois", "qpois", "subtract", "logFE", "FE", "logLR", "slogLR", "max" ),
459 help = "Method to use while calculating a score in any bin by comparing treatment value and control value. Available choices are: ppois, qpois, subtract, logFE, logLR, and slogLR. They represent Poisson Pvalue (-log10(pvalue) form) using control as lambda and treatment as observation, q-value through a BH process for poisson pvalues, subtraction from treatment, linear scale fold enrichment, log10 fold enrichment(need to set pseudocount), log10 likelihood between ChIP-enriched model and open chromatin model(need to set pseudocount), symmetric log10 likelihood between two ChIP-enrichment models, or maximum value between the two tracks. Default option is ppois.",default="ppois")
461 add_outdir_option( argparser_bdgcmp )
462 output_group = argparser_bdgcmp.add_mutually_exclusive_group( required = True )
463 output_group.add_argument( "--o-prefix", dest = "oprefix", type = str,
464 help = "The PREFIX of output bedGraph file to write scores. If it is given as A, and method is 'ppois', output file will be A_ppois.bdg. Mutually exclusive with -o/--ofile." )
465 output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str, nargs = "+",
466 help = "Output filename. Mutually exclusive with --o-prefix. The number and the order of arguments for --ofile must be the same as for -m." )
467 return
469 def add_bdgopt_parser( subparsers ):
470 """Add function 'operations on score column of bedGraph' argument parsers.
471 """
472 argparser_bdgopt = subparsers.add_parser( "bdgopt",
473 help = "Operations on score column of bedGraph file. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
474 argparser_bdgopt.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True,
475 help = "MACS score in bedGraph. Note: this must be a bedGraph file covering the ENTIRE genome. REQUIRED" )
476 argparser_bdgopt.add_argument( "-m", "--method", dest = "method", type = str,
477 choices = ( "multiply", "add", "p2q", "max", "min" ),
478 help = "Method to modify the score column of bedGraph file. Available choices are: multiply, add, max, min, or p2q. 1) multiply, the EXTRAPARAM is required and will be multiplied to the score column. If you intend to divide the score column by X, use value of 1/X as EXTRAPARAM. 2) add, the EXTRAPARAM is required and will be added to the score column. If you intend to subtract the score column by X, use value of -X as EXTRAPARAM. 3) max, the EXTRAPARAM is required and will take the maximum value between score and the EXTRAPARAM. 4) min, the EXTRAPARAM is required and will take the minimum value between score and the EXTRAPARAM. 5) p2q, this will convert p-value scores to q-value scores using Benjamini-Hochberg process. The EXTRAPARAM is not required. This method assumes the scores are -log10 p-value from MACS2. Any other types of score will cause unexpected errors.", default="p2q")
479 argparser_bdgopt.add_argument( "-p", "--extra-param", dest = "extraparam", type = float, nargs = "*",
480 help = "The extra parameter for METHOD. Check the detail of -m option.")
481 add_outdir_option( argparser_bdgopt )
482 argparser_bdgopt.add_argument( "-o", "--ofile", dest = "ofile", type = str,
483 help = "Output BEDGraph filename.", required = True )
484 return
486 def add_cmbreps_parser( subparsers ):
487 """Add function 'combine replicates' argument parsers.
489 argparser_cmbreps = subparsers.add_parser( "cmbreps",
490 help = "Combine BEDGraphs of scores from replicates. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
491 argparser_cmbreps.add_argument( "-i", dest = "ifile", type = str, required = True, nargs = "+",
492 help = "MACS score in bedGraph for each replicate. Require at least 2 files such as '-i A B C D'. REQUIRED" )
493 # argparser_cmbreps.add_argument( "-w", dest = "weights", type = float, nargs = "*",
494 # help = "Weight for each replicate. Default is 1.0 for each. When given, require same number of parameters as IFILE." )
495 argparser_cmbreps.add_argument( "-m", "--method", dest = "method", type = str,
496 choices = ( "fisher", "max", "mean" ),
497 help = "Method to use while combining scores from replicates. 1) fisher: Fisher's combined probability test. It requires scores in ppois form (-log10 pvalues) from bdgcmp. Other types of scores for this method may cause cmbreps unexpected errors. 2) max: take the maximum value from replicates for each genomic position. 3) mean: take the average value. Note, except for Fisher's method, max or mean will take scores AS IS which means they won't convert scores from log scale to linear scale or vice versa.", default="fisher")
498 add_outdir_option( argparser_cmbreps )
499 argparser_cmbreps.add_argument( "-o", "--ofile", dest = "ofile", type = str, required = True,
500 help = "Output BEDGraph filename for combined scores." )
501 return
503 def add_randsample_parser( subparsers ):
504 argparser_randsample = subparsers.add_parser( "randsample",
505 help = "Randomly sample number/percentage of total reads." )
506 argparser_randsample.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
507 help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
509 p_or_n_group = argparser_randsample.add_mutually_exclusive_group( required = True )
510 p_or_n_group.add_argument( "-p", "--percentage", dest = "percentage", type = float,
511 help = "Percentage of tags you want to keep. Input 80.0 for 80%%. This option can't be used at the same time with -n/--num. REQUIRED")
512 p_or_n_group.add_argument( "-n", "--number", dest = "number", type = float,
513 help = "Number of tags you want to keep. Input 8000000 or 8e+6 for 8 million. This option can't be used at the same time with -p/--percent. Note that the number of tags in output is approximate as the number specified here. REQUIRED" )
514 argparser_randsample.add_argument( "--seed", dest = "seed", type = int, default = -1,
515 help = "Set the random seed while down sampling data. Must be a non-negative integer in order to be effective. DEFAULT: not set" )
516 argparser_randsample.add_argument( "-o", "--ofile", dest = "outputfile", type = str,
517 help = "Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. DEFAULT: stdout",
518 default = None)
519 add_outdir_option( argparser_randsample )
520 argparser_randsample.add_argument( "-s", "--tsize", dest = "tsize", type = int, default = None,
521 help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set")
522 argparser_randsample.add_argument( "-f", "--format", dest = "format", type = str,
523 choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"),
524 help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will %(prog)s decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"",
525 default = "AUTO" )
526 argparser_randsample.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
527 help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
529 argparser_randsample.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
530 help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
531 return
533 def add_bdgdiff_parser( subparsers ):
534 argparser_bdgdiff = subparsers.add_parser( "bdgdiff",
535 help = "Differential peak detection based on paired four bedgraph files. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
536 argparser_bdgdiff.add_argument( "--t1", dest = "t1bdg", type = str, required = True,
537 help = "MACS pileup bedGraph for condition 1. Incompatible with callpeak --SPMR output. REQUIRED" )
538 argparser_bdgdiff.add_argument( "--t2", dest="t2bdg", type = str, required = True,
539 help = "MACS pileup bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED" )
540 argparser_bdgdiff.add_argument( "--c1", dest = "c1bdg", type = str, required = True,
541 help = "MACS control lambda bedGraph for condition 1. Incompatible with callpeak --SPMR output. REQUIRED" )
542 argparser_bdgdiff.add_argument( "--c2", dest="c2bdg", type = str, required = True,
543 help = "MACS control lambda bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED" )
544 argparser_bdgdiff.add_argument( "-C", "--cutoff", dest = "cutoff", type = float,
545 help = "logLR cutoff. DEFAULT: 3 (likelihood ratio=1000)", default = 3 )
546 argparser_bdgdiff.add_argument( "-l", "--min-len", dest = "minlen", type = int,
547 help = "Minimum length of differential region. Try bigger value to remove small regions. DEFAULT: 200", default = 200 )
548 argparser_bdgdiff.add_argument( "-g", "--max-gap", dest = "maxgap", type = int,
549 help = "Maximum gap to merge nearby differential regions. Consider a wider gap for broad marks. Maximum gap should be smaller than minimum length (-g). DEFAULT: 100", default = 100 )
550 argparser_bdgdiff.add_argument( "--d1", "--depth1", dest = "depth1", type = float, default = 1.0,
551 help = "Sequencing depth (# of non-redundant reads in million) for condition 1. It will be used together with --d2. See description for --d2 below for how to assign them. Default: 1" )
552 argparser_bdgdiff.add_argument( "--d2", "--depth2", dest = "depth2", type = float, default = 1.0,
553 help = "Sequencing depth (# of non-redundant reads in million) for condition 2. It will be used together with --d1. DEPTH1 and DEPTH2 will be used to calculate scaling factor for each sample, to down-scale larger sample to the level of smaller one. For example, while comparing 10 million condition 1 and 20 million condition 2, use --d1 10 --d2 20, then pileup value in bedGraph for condition 2 will be divided by 2. Default: 1" )
555 add_outdir_option( argparser_bdgdiff )
556 output_group = argparser_bdgdiff.add_mutually_exclusive_group( required = True )
557 output_group.add_argument( "--o-prefix", dest = "oprefix", type = str,
558 help = "Output file prefix. Actual files will be named as PREFIX_cond1.bed, PREFIX_cond2.bed and PREFIX_common.bed. Mutually exclusive with -o/--ofile." )
559 output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str, nargs = 3,
560 help = "Output filenames. Must give three arguments in order: 1. file for unique regions in condition 1; 2. file for unique regions in condition 2; 3. file for common regions in both conditions. Note: mutually exclusive with --o-prefix." )
562 return
564 def add_refinepeak_parser( subparsers ):
565 argparser_refinepeak = subparsers.add_parser( "refinepeak",
566 help = "(Experimental) Take raw reads alignment, refine peak summits and give scores measuring balance of waston/crick tags. Inspired by SPP." )
567 argparser_refinepeak.add_argument( "-b", dest = "bedfile", type = str, required = True,
568 help = "Candidate peak file in BED format. REQUIRED." )
569 argparser_refinepeak.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
570 help = "ChIP-seq alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
571 argparser_refinepeak.add_argument( "-f", "--format", dest = "format", type = str,
572 choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE"),
573 help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"",
574 default = "AUTO" )
575 argparser_refinepeak.add_argument( "-c", "--cutoff" , dest = "cutoff", type = float,
576 help = "Cutoff DEFAULT: 5", default = 5 )
577 argparser_refinepeak.add_argument( "-w", "--window-size", dest= "windowsize", help = 'Scan window size on both side of the summit (default: 100bp)',
578 type = int, default = 200)
579 argparser_refinepeak.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
580 help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
582 argparser_refinepeak.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
583 help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
585 add_outdir_option( argparser_refinepeak )
586 add_output_group( argparser_refinepeak )
588 return
590 def add_predictd_parser( subparsers ):
591 """Add main function 'predictd' argument parsers.
593 argparser_predictd = subparsers.add_parser("predictd", help="Predict d or fragment size from alignment results. *Will NOT filter duplicates*")
595 # group for input files
596 argparser_predictd.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
597 help = "ChIP-seq alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
598 argparser_predictd.add_argument( "-f", "--format", dest = "format", type = str,
599 choices = ("AUTO", "BAM", "SAM", "BED", "ELAND",
600 "ELANDMULTI", "ELANDEXPORT", "BOWTIE",
601 "BAMPE", "BEDPE"),
602 help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"",
603 default = "AUTO" )
604 argparser_predictd.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
605 help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default:hs" )
606 argparser_predictd.add_argument( "-s", "--tsize", dest = "tsize", type = int, default = None,
607 help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set")
608 argparser_predictd.add_argument( "--bw", dest = "bw", type = int, default = 300,
609 help = "Band width for picking regions to compute fragment size. This value is only used while building the shifting model. DEFAULT: 300")
610 argparser_predictd.add_argument( "--d-min", dest = "d_min", type = int, default = 20,
611 help = "Minimum fragment size in basepair. Any predicted fragment size less than this will be excluded. DEFAULT: 20")
612 argparser_predictd.add_argument( "-m", "--mfold", dest = "mfold", type = int, default = [5,50], nargs = 2,
613 help = "Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. Fold-enrichment in regions must be lower than upper limit, and higher than the lower limit. Use as \"-m 10 30\". DEFAULT:5 50" )
615 add_outdir_option( argparser_predictd )
616 argparser_predictd.add_argument( "--rfile", dest = "rfile", type = str, default = "predictd",
617 help = "PREFIX of filename of R script for drawing X-correlation figure. DEFAULT:'predictd' and R file will be predicted_model.R" )
618 argparser_predictd.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
619 help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
621 argparser_predictd.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
622 help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )
624 return
626 def add_pileup_parser( subparsers ):
627 argparser_pileup = subparsers.add_parser( "pileup",
628 help = "Pileup aligned reads with a given extension size (fragment size or d in MACS language). Note there will be no step for duplicate reads filtering or sequencing depth scaling, so you may need to do certain pre/post-processing." )
629 argparser_pileup.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
630 help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
631 argparser_pileup.add_argument( "-o", "--ofile", dest = "outputfile", type = str, required = True,
632 help = "Output bedGraph file name. If not specified, will write to standard output. REQUIRED." )
633 add_outdir_option( argparser_pileup )
634 argparser_pileup.add_argument( "-f", "--format", dest = "format", type = str,
635 choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"),
636 help = "Format of tag file, \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\", \"BOWTIE\", \"BAMPE\", or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. DEFAULT: \"AUTO\", MACS2 will pick a format from \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\" and \"BOWTIE\". If the format is BAMPE or BEDPE, please specify it explicitly. Please note that when the format is BAMPE or BEDPE, the -B and --extsize options would be ignored.",
637 default = "AUTO" )
638 argparser_pileup.add_argument( "-B", "--both-direction", dest = "bothdirection", action = "store_true", default = False,
639 help = "By default, any read will be extended towards downstream direction by extension size. So it's [0,size-1] (1-based index system) for plus strand read and [-size+1,0] for minus strand read where position 0 is 5' end of the aligned read. Default behavior can simulate MACS2 way of piling up ChIP sample reads where extension size is set as fragment size/d. If this option is set as on, aligned reads will be extended in both upstream and downstream directions by extension size. It means [-size,size] where 0 is the 5' end of a aligned read. It can partially simulate MACS2 way of piling up control reads. However MACS2 local bias is calculated by maximizing the expected pileup over a ChIP fragment size/d estimated from 10kb, 1kb, d and whole genome background. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: False" )
641 argparser_pileup.add_argument( "--extsize", dest = "extsize", type = int, default = 200,
642 help = "The extension size in bps. Each alignment read will become a EXTSIZE of fragment, then be piled up. Check description for -B for detail. It's twice the `shiftsize` in old MACSv1 language. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: 200 " )
643 argparser_pileup.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
644 help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
646 argparser_pileup.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
647 help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
648 return
651 if __name__ == '__main__':
652 try:
653 main()
654 except KeyboardInterrupt:
655 sys.stderr.write("User interrupted me! ;-) Bye!\n")
656 sys.exit(0)