bin/macs2

   1 #!/usr/bin/env python
   2 # Time-stamp: <2019-09-25 10:24:38 taoliu>
   3
   4 """Description: MACS v2 main executable.
   5
   6 This code is free software; you can redistribute it and/or modify it
   7 under the terms of the BSD License (see the file LICENSE included with
   8 the distribution).
   9 """
  10
  11 # ------------------------------------
  12 # python modules
  13 # ------------------------------------
  14
  15 import os
  16 import sys
  17 import argparse as ap
  18 import tempfile
  19
  20 # ------------------------------------
  21 # own python modules
  22 # ------------------------------------
  23 from MACS2.Constants import *
  24
  25 # ------------------------------------
  26 # Main function
  27 # ------------------------------------
  28 def main():
  29     """The Main function/pipeline for MACS.
  30
  31     """
  32     # Parse options...
  33     argparser = prepare_argparser()
  34     args = argparser.parse_args()
  35
  36     subcommand  = args.subcommand
  37
  38     if args.outdir:
  39         # use a output directory to store MACS output
  40         if not os.path.exists( args.outdir ):
  41             try:
  42                 os.makedirs( args.outdir )
  43             except:
  44                 sys.exit( "Output directory (%s) could not be created. Terminating program." % args.outdir )
  45
  46
  47     if subcommand == "callpeak":
  48         # General call peak
  49         from MACS2.callpeak_cmd import run
  50         try:
  51             run( args )
  52         except MemoryError:
  53             sys.exit( "MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
  54     #elif subcommand == "diffpeak":
  55     #    # differential peak calling w/ bedgraphs + optional peak regions
  56     #    from MACS2.diffpeak_cmd import run
  57     #    run( args )
  58     elif subcommand == "bdgpeakcall":
  59         # call peak from bedGraph
  60         from MACS2.bdgpeakcall_cmd import run
  61         run( args )
  62     elif subcommand == "bdgbroadcall":
  63         # call broad peak from bedGraph
  64         from MACS2.bdgbroadcall_cmd import run
  65         run( args )
  66     elif subcommand == "bdgcmp":
  67         # compare treatment and control to make enrichment scores
  68         from MACS2.bdgcmp_cmd import run
  69         run( args )
  70     elif subcommand == "bdgopt":
  71         # operations on the score column of bedGraph file
  72         from MACS2.bdgopt_cmd import run
  73         run( args )
  74     elif subcommand == "cmbreps":
  75         # combine replicates
  76         from MACS2.cmbreps_cmd import run
  77         run( args )
  78     elif subcommand == "randsample":
  79         # randomly sample sequencing reads, and save as bed file
  80         from MACS2.randsample_cmd import run
  81         try:
  82             run( args )
  83         except MemoryError:
  84             sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
  85     elif subcommand == "filterdup":
  86         # filter out duplicate reads, and save as bed file
  87         from MACS2.filterdup_cmd import run
  88         try:
  89             run( args )
  90         except MemoryError:
  91             sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
  92     elif subcommand == "bdgdiff":
  93         # differential calling
  94         from MACS2.bdgdiff_cmd import run
  95         run( args )
  96     elif subcommand == "refinepeak":
  97         # refine peak summits
  98         from MACS2.refinepeak_cmd import run
  99         try:
 100             run( args )
 101         except MemoryError:
 102             sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
 103     elif subcommand == "predictd":
 104         # predict d or fragment size
 105         from MACS2.predictd_cmd import run
 106         try:
 107             run( args )
 108         except MemoryError:
 109             sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
 110     elif subcommand == "pileup":
 111         # pileup alignment results with a given extension method
 112         from MACS2.pileup_cmd import run
 113         try:
 114             run( args )
 115         except MemoryError:
 116             sys.exit( "!!!MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option." )
 117
 118 def prepare_argparser ():
 119     """Prepare optparser object. New options will be added in this
 120     function first.
 121
 122     """
 123     description = "%(prog)s -- Model-based Analysis for ChIP-Sequencing"
 124     epilog = "For command line options of each command, type: %(prog)s COMMAND -h"
 125     #Check community site: http://groups.google.com/group/macs-announcement/
 126     #Source code: https://github.com/taoliu/MACS/"
 127     # top-level parser
 128     argparser = ap.ArgumentParser( description = description, epilog = epilog ) #, usage = usage )
 129     argparser.add_argument("--version", action="version", version="%(prog)s "+MACS_VERSION)
 130     subparsers = argparser.add_subparsers( dest = 'subcommand' ) #help="sub-command help")
 131     subparsers.required = True
 132
 133     # command for 'callpeak'
 134     add_callpeak_parser( subparsers )
 135
 136     # # command for 'diffpeak'
 137     # add_diffpeak_parser( subparsers )
 138
 139     # command for 'bdgpeakcall'
 140     add_bdgpeakcall_parser( subparsers )
 141
 142     # command for 'bdgbroadcall'
 143     add_bdgbroadcall_parser( subparsers )
 144
 145     # command for 'bdgcmp'
 146     add_bdgcmp_parser( subparsers )
 147
 148     # command for 'bdgopt'
 149     add_bdgopt_parser( subparsers )
 150
 151     # command for 'cmbreps'
 152     add_cmbreps_parser( subparsers )
 153
 154     # command for 'bdgdiff'
 155     add_bdgdiff_parser( subparsers )
 156
 157     # command for 'filterdup'
 158     add_filterdup_parser( subparsers )
 159
 160     # command for 'predictd'
 161     add_predictd_parser( subparsers )
 162
 163     # command for 'pileup'
 164     add_pileup_parser( subparsers )
 165
 166     # command for 'randsample'
 167     add_randsample_parser( subparsers )
 168
 169     # command for 'refinepeak'
 170     add_refinepeak_parser( subparsers )
 171
 172     return argparser
 173
 174 def add_outdir_option ( parser ):
 175     parser.add_argument("--outdir", dest = "outdir", type = str, default = '',
 176                         help = "If specified all output files will be written to that directory. Default: the current working directory")
 177
 178 def add_output_group ( parser, required = True ):
 179     output_group = parser.add_mutually_exclusive_group( required = required )
 180     output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str,
 181                                help = "Output file name. Mutually exclusive with --o-prefix." )
 182     output_group.add_argument( "--o-prefix", dest = "oprefix", type = str,
 183                                help = "Output file prefix. Mutually exclusive with -o/--ofile." )
 184
 185 def add_callpeak_parser( subparsers ):
 186     """Add main function 'peak calling' argument parsers.
 187     """
 188     argparser_callpeak = subparsers.add_parser("callpeak", help="Main MACS2 Function: Call peaks from alignment results.")
 189
 190     # group for input files
 191     group_input = argparser_callpeak.add_argument_group( "Input files arguments" )
 192     group_input.add_argument( "-t", "--treatment", dest = "tfile", type = str, required = True, nargs = "+",
 193                               help = "ChIP-seq treatment file. If multiple files are given as '-t A B C', then they will all be read and pooled together. REQUIRED." )
 194     group_input.add_argument( "-c", "--control", dest = "cfile", type = str, nargs = "*",
 195                                     help = "Control file. If multiple files are given as '-c A B C', they will be pooled to estimate ChIP-seq background noise.")
 196     group_input.add_argument( "-f", "--format", dest = "format", type = str,
 197                               choices = ("AUTO", "BAM", "SAM", "BED", "ELAND",
 198                                          "ELANDMULTI", "ELANDEXPORT", "BOWTIE",
 199                                           "BAMPE", "BEDPE"),
 200                               help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format (except for BAMPE and BEDPE which should be implicitly set) the file is. Please check the definition in README. Please note that if the format is set as BAMPE or BEDPE, MACS2 will call its special Paired-end mode to call peaks by piling up the actual ChIPed fragments defined by both aligned ends, instead of predicting the fragment size first and extending reads. Also please note that the BEDPE only contains three columns, and is NOT the same BEDPE format used by BEDTOOLS. DEFAULT: \"AUTO\"",
 201                               default = "AUTO" )
 202     group_input.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
 203                               help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default:hs" )
 204     group_input.add_argument( "-s", "--tsize",  dest = "tsize", type = int, default = None,
 205                                     help = "Tag size/read length. This will override the auto detected tag size. DEFAULT: Not set")
 206     group_input.add_argument( "--keep-dup", dest = "keepduplicates", type = str, default = "1",
 207                               help = "It controls the  behavior towards duplicate tags at the exact same location -- the same coordination and the same strand. The 'auto' option makes MACS calculate the maximum tags at the exact same location based on binomal distribution using 1e-5 as pvalue cutoff; and the 'all' option keeps every tags. If an integer is given, at most this number of tags will be kept at the same location. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS2 will still read them although the reads may be decided by MACS2 as duplicate later. If you plan to rely on samtools/picard/any other tool to filter duplicates, please remove those duplicate reads and save a new alignment file then ask MACS2 to keep all by '--keep-dup all'. The default is to keep one tag at the same location. Default: 1" )
 208
 209     # group for output files
 210     group_output = argparser_callpeak.add_argument_group( "Output arguments" )
 211     add_outdir_option( group_output )
 212     group_output.add_argument( "-n", "--name", dest = "name", type = str,
 213                                help = "Experiment name, which will be used to generate output file names. DEFAULT: \"NA\"",
 214                                default = "NA" )
 215     group_output.add_argument( "-B", "--bdg", dest = "store_bdg", action = "store_true",
 216                                help = "Whether or not to save extended fragment pileup, and local lambda tracks (two files) at every bp into a bedGraph file. DEFAULT: False",
 217                                default = False )
 218     group_output.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
 219                                help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )
 220     group_output.add_argument( "--trackline", dest="trackline", action="store_true", default = False,
 221                                help = "Tells MACS to include trackline with bedGraph files. To include this trackline while displaying bedGraph at UCSC genome browser, can show name and description of the file as well. However my suggestion is to convert bedGraph to bigWig, then show the smaller and faster binary bigWig file at UCSC genome browser, as well as downstream analysis. Require -B to be set. Default: Not include trackline." )
 222
 223     group_output.add_argument( "--SPMR", dest = "do_SPMR", action = "store_true", default = False,
 224                                help = "If True, MACS will SAVE signal per million reads for fragment pileup profiles. It won't interfere with computing pvalue/qvalue during peak calling, since internally MACS2 keeps using the raw pileup and scaling factors between larger and smaller dataset to calculate statistics measurements. If you plan to use the signal output in bedGraph to call peaks using bdgcmp and bdgpeakcall, you shouldn't use this option because you will end up with different results. However, this option is recommended for displaying normalized pileup tracks across many datasets. Require -B to be set. Default: False" )
 225     # group for bimodal
 226     group_bimodal = argparser_callpeak.add_argument_group( "Shifting model arguments" )
 227     group_bimodal.add_argument( "--nomodel", dest = "nomodel", action = "store_true", default = False,
 228                                 help = "Whether or not to build the shifting model. If True, MACS will not build model. by default it means shifting size = 100, try to set extsize to change it. It's highly recommended that while you have many datasets to process and you plan to compare different conditions, aka differential calling, use both 'nomodel' and 'extsize' to make signal files from different datasets comparable. DEFAULT: False" )
 229     group_bimodal.add_argument( "--shift", dest = "shift", type = int, default = 0,
 230                                 help = "(NOT the legacy --shiftsize option!) The arbitrary shift in bp. Use discretion while setting it other than default value. When NOMODEL is set, MACS will use this value to move cutting ends (5') towards 5'->3' direction then apply EXTSIZE to extend them to fragments. When this value is negative, ends will be moved toward 3'->5' direction. Recommended to keep it as default 0 for ChIP-Seq datasets, or -1 * half of EXTSIZE together with EXTSIZE option for detecting enriched cutting loci such as certain DNAseI-Seq datasets. Note, you can't set values other than 0 if format is BAMPE or BEDPE for paired-end data. DEFAULT: 0. " )
 231     group_bimodal.add_argument( "--extsize", dest = "extsize", type = int, default = 200,
 232                                 help = "The arbitrary extension size in bp. When nomodel is true, MACS will use this value as fragment size to extend each read towards 3' end, then pile them up. It's exactly twice the number of obsolete SHIFTSIZE. In previous language, each read is moved 5'->3' direction to middle of fragment by 1/2 d, then extended to both direction with 1/2 d. This is equivalent to say each read is extended towards 5'->3' into a d size fragment. DEFAULT: 200. EXTSIZE and SHIFT can be combined when necessary. Check SHIFT option." )
 233     group_bimodal.add_argument( "--bw", dest = "bw", type = int, default = 300,
 234                                 help = "Band width for picking regions to compute fragment size. This value is only used while building the shifting model. Tweaking this is not recommended. DEFAULT: 300")
 235     group_bimodal.add_argument( "--d-min", dest = "d_min", type = int, default = 20,
 236                                 help = "Minimum fragment size in basepair. Any predicted fragment size less than this will be excluded. DEFAULT: 20")
 237     group_bimodal.add_argument( "-m", "--mfold", dest = "mfold", type = int, default = [5,50], nargs = 2,
 238                                 help = "Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. Fold-enrichment in regions must be lower than upper limit, and higher than the lower limit. Use as \"-m 10 30\". This setting is only used while building the shifting model. Tweaking it is not recommended. DEFAULT:5 50" )
 239
 240     group_bimodal.add_argument( "--fix-bimodal", dest = "onauto", action = "store_true",
 241                                 help = "Whether turn on the auto pair model process. If set, when MACS failed to build paired model, it will use the nomodel settings, the --exsize parameter to extend each tags towards 3' direction. Not to use this automate fixation is a default behavior now. DEFAULT: False",
 242                                 default = False )
 243
 244     # General options.
 245     group_callpeak = argparser_callpeak.add_argument_group( "Peak calling arguments" )
 246     p_or_q_group = group_callpeak.add_mutually_exclusive_group()
 247     p_or_q_group.add_argument( "-q", "--qvalue", dest = "qvalue", type = float, default = 0.05,
 248                                help = "Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.05. -q, and -p are mutually exclusive." )
 249     p_or_q_group.add_argument( "-p", "--pvalue", dest = "pvalue", type = float,
 250                                help = "Pvalue cutoff for peak detection. DEFAULT: not set. -q, and -p are mutually exclusive. If pvalue cutoff is set, qvalue will not be calculated and reported as -1 in the final .xls file." )
 251
 252     # about scaling
 253     group_callpeak.add_argument( "--scale-to", dest = "scaleto", type = str, choices = ("large", "small"),
 254                                 help = "When set to 'small', scale the larger sample up to the smaller sample. When set to 'larger', scale the smaller sample up to the bigger sample. By default, scale to 'small'. This option replaces the obsolete '--to-large' option. The default behavior is recommended since it will lead to less significant p/q-values in general but more specific results. Keep in mind that scaling down will influence control/input sample more. DEFAULT: 'small', the choice is either 'small' or 'large'." )
 255     group_callpeak.add_argument( "--down-sample", dest = "downsample", action = "store_true", default = False,
 256                                  help = "When set, random sampling method will scale down the bigger sample. By default, MACS uses linear scaling. Warning: This option will make your result unstable and irreproducible since each time, random reads would be selected. Consider to use 'randsample' script instead. <not implmented>If used together with --SPMR, 1 million unique reads will be randomly picked.</not implemented> Caution: due to the implementation, the final number of selected reads may not be as you expected! DEFAULT: False" )
 257     group_callpeak.add_argument( "--seed", dest = "seed", type = int, default = -1,
 258                                  help = "Set the random seed while down sampling data. Must be a non-negative integer in order to be effective. DEFAULT: not set" )
 259     group_callpeak.add_argument( "--tempdir", dest="tempdir", default=tempfile.gettempdir(),
 260                                 help = "Optional directory to store temp files. DEFAULT: %(default)s")
 261     group_callpeak.add_argument( "--nolambda", dest = "nolambda", action = "store_true",
 262                                  help = "If True, MACS will use fixed background lambda as local lambda for every peak region. Normally, MACS calculates a dynamic local lambda to reflect the local bias due to the potential chromatin accessibility. ",
 263                                  default = False )
 264     group_callpeak.add_argument( "--slocal", dest = "smalllocal", type = int, default = 1000,
 265                                  help = "The small nearby region in basepairs to calculate dynamic lambda. This is used to capture the bias near the peak summit region. Invalid if there is no control data. If you set this to 0, MACS will skip slocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation while the control data is available. The final local bias would be the maximum of the lambda value from d, slocal, and llocal size windows. While control is not available, d and slocal lambda won't be considered. DEFAULT: 1000 " )
 266     group_callpeak.add_argument( "--llocal", dest = "largelocal", type = int, default = 10000,
 267                                  help = "The large nearby region in basepairs to calculate dynamic lambda. This is used to capture the surround bias. If you set this to 0, MACS will skip llocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation while the control data is available. The final local bias would be the maximum of the lambda value from d, slocal, and llocal size windows. While control is not available, d and slocal lambda won't be considered. DEFAULT: 10000." )
 268     group_callpeak.add_argument( "--max-gap", dest = "maxgap", type = int,
 269                                  help = "Maximum gap between significant sites to cluster them together. The DEFAULT value is the detected read length/tag size." )
 270     group_callpeak.add_argument( "--min-length", dest = "minlen", type = int,
 271                                  help = "Minimum length of a peak. The DEFAULT value is the predicted fragment size d. Note, if you set a value smaller than the fragment size, it may have NO effect on the result. For BROAD peak calling, try to set a large value such as 500bps. You can also use '--cutoff-analysis' option with default setting, and  check the column 'avelpeak' under different cutoff values to decide a reasonable minlen value." )
 272     group_callpeak.add_argument( "--broad", dest = "broad", action = "store_true",
 273                                  help = "If set, MACS will try to call broad peaks using the --broad-cutoff setting. Please tweak '--broad-cutoff' setting to control the peak calling behavior. At the meantime, either -q or -p cutoff will be used to define regions with 'stronger enrichment' inside of broad peaks. The maximum gap is expanded to 4 * MAXGAP (--max-gap parameter). As a result, MACS will output a 'gappedPeak' and a 'broadPeak' file instead of 'narrowPeak' file. Note, a broad peak will be reported even if there is no 'stronger enrichment' inside. DEFAULT: False", default = False )
 274     group_callpeak.add_argument( "--broad-cutoff", dest = "broadcutoff", type = float, default = 0.1,
 275                                  help = "Cutoff for broad region. This option is not available unless --broad is set. If -p is set, this is a pvalue cutoff, otherwise, it's a qvalue cutoff. Please note that in broad peakcalling mode, MACS2 uses this setting to control the overall peak calling behavior, then uses -q or -p setting to define regions inside broad region as 'stronger' enrichment. DEFAULT: 0.1 " )
 276     group_callpeak.add_argument( "--cutoff-analysis", dest="cutoff_analysis", action="store_true",
 277                                  help = "While set, MACS2 will analyze number or total length of peaks that can be called by different p-value cutoff then output a summary table to help user decide a better cutoff. The table will be saved in NAME_cutoff_analysis.txt file. Note, minlen and maxgap may affect the results. WARNING: May take  ~30 folds longer time to finish. The result can be useful for users to decide a reasonable cutoff value. DEFAULT: False", default = False )
 278
 279     # post-processing options
 280     group_postprocessing = argparser_callpeak.add_argument_group( "Post-processing options" )
 281     postprocess_group = group_postprocessing.add_mutually_exclusive_group()
 282
 283     postprocess_group.add_argument( "--call-summits", dest="call_summits", action="store_true",
 284                                     help="If set, MACS will use a more sophisticated signal processing approach to find subpeak summits in each enriched peak region. DEFAULT: False",default=False)
 285     # postprocess_group.add_argument( "--refine-peaks", dest="refine_peaks", action="store_true",
 286     #                                 help="If set, MACS will refine peak summits by measuring balance of waston/crick tags. Those peaks without balancing tags will be disgarded. Peak summits will be redefined and reassgined with scores. Note, duplicate reads will be put back while calculating read balance. And more memory will be used. Default: False", default=False )
 287     group_postprocessing.add_argument( "--fe-cutoff", dest="fecutoff", type=float, default = 1.0,
 288                                        help = "When set, the value will be used to filter out peaks with low fold-enrichment. Note, MACS2 use 1.0 as pseudocount while calculating fold-enrichment. DEFAULT: 1.0")
 289
 290     # other options
 291     group_other = argparser_callpeak.add_argument_group( "Other options" )
 292     group_other.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
 293                                   help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
 294
 295     # obsolete options
 296     group_obsolete = argparser_callpeak.add_argument_group( "Obsolete options" )
 297     group_obsolete.add_argument( "--to-large", dest = "tolarge", action = "store_true", default = False,
 298                                      help = "Obsolete option. Please use '--scale-to large' instead." )
 299     group_obsolete.add_argument( "--ratio", dest = "ratio", type = float, default = 1.0,
 300                                      help = "Obsolete option. Originally designed to normalize treatment and control with customized ratio, now it won't have any effect." )
 301     return
 302
 303 def add_diffpeak_parser( subparsers ):
 304     """Add main function 'peak calling' argument parsers.
 305     """
 306     argparser_diffpeak = subparsers.add_parser("diffpeak", help="MACS2 Differential Peak Function: Call peaks from bedgraphs (or use optional peak regions) and determine peaks of differential occupancy")
 307
 308
 309     # group for input files
 310     group_input = argparser_diffpeak.add_argument_group( "Input files arguments" )
 311     group_input.add_argument( "--t1", dest = "t1bdg", type = str, required = True,
 312                                     help = "MACS pileup bedGraph for condition 1. REQUIRED" )
 313     group_input.add_argument( "--t2", dest="t2bdg", type = str, required = True,
 314                                     help = "MACS pileup bedGraph for condition 2. REQUIRED" )
 315     group_input.add_argument( "--c1", dest = "c1bdg", type = str, required = True,
 316                                     help = "MACS control lambda bedGraph for condition 1. REQUIRED" )
 317     group_input.add_argument( "--c2", dest="c2bdg", type = str, required = True,
 318                                     help = "MACS control lambda bedGraph for condition 2. REQUIRED" )
 319     group_input.add_argument( "--peaks1", dest = "peaks1", type = str, default='',
 320                                     help = "MACS peaks.xls file for condition 1. Optional but must specify peaks2 if present" )
 321     group_input.add_argument( "--peaks2", dest="peaks2", type = str, default='',
 322                                     help = "MACS peaks.xls file for condition 2. Optional but must specify peaks1 if present" )
 323     group_input.add_argument( "-d", "--depth-multiplier", dest = "depth", type = float, default = [1.0], nargs = "+",
 324                                     help = "Sequence depth in million reads. If two depths are different, use '-d X -d Y' for X million reads in condition 1 and Y million reads in condition 2. If they are same, use '-d X' for X million reads in both condition 1 and condition 2 (e.g. the bedGraph files are from 'callpeak --SPMR'). Default: 1 (if you use 'macs2 callpeak --SPMR' to generate bdg files, we recommend using the smaller depth as a multiplier)" )
 325 #    group_input.add_argument( "-f", "--format", dest = "format", type = str,
 326 #                              choices = ("AUTO", "BED", "XLS"),
 327 #                              help = "Format of peak regions file, \"AUTO\", \"BED\" or \"XLS\". The default AUTO option will let MACS decide which format the file is based on the file extension. DEFAULT: \"AUTO\"",
 328 #                              default = "AUTO" )
 329
 330     # group for output files
 331     group_output = argparser_diffpeak.add_argument_group( "Output arguments" )
 332     add_outdir_option( group_output )
 333     group_output.add_argument( "-n", "--name", dest = "name", type = str,
 334                                help = "Experiment name, which will be used to generate output file names. DEFAULT: \"diffpeak\"",
 335                                default = "diffpeak" )
 336     group_output.add_argument( "-B", "--bdg", dest = "store_bdg", action = "store_true",
 337                                help = "Whether or not to save basewise p/qvalues from every peak region into a bedGraph file. DEFAULT: False",
 338                                default = False )
 339     group_output.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
 340                                help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )
 341     group_output.add_argument( "--trackline", dest="trackline", action="store_true", default = False,
 342                                help = "Tells MACS to include trackline with bedGraph files. To include this trackline while displaying bedGraph at UCSC genome browser, can show name and description of the file as well. However my suggestion is to convert bedGraph to bigWig, then show the smaller and faster binary bigWig file at UCSC genome browser, as well as downstream analysis. Require -B to be set. Default: Not include trackline." )
 343
 344     # General options.
 345     group_diffpeak = argparser_diffpeak.add_argument_group( "Peak calling arguments" )
 346     p_or_q_group = group_diffpeak.add_mutually_exclusive_group()
 347     p_or_q_group.add_argument( "-q", "--qvalue", dest = "diff_qvalue", type = float, default = 0.05,
 348                                help = "Minimum FDR (q-value) cutoff for differences. DEFAULT: 0.05. -q and -p are mutually exclusive." )
 349     p_or_q_group.add_argument( "-p", "--pvalue", dest = "diff_pvalue", type = float,
 350                                help = "Pvalue cutoff for differences. DEFAULT: not set. -q and -p are mutually exclusive." )
 351     p_or_q_group2 = group_diffpeak.add_mutually_exclusive_group()
 352     p_or_q_group2.add_argument( "--peaks-qvalue", dest = "peaks_qvalue", type = float, default = 0.05,
 353                                help = "Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.05. --peaks-qvalue and --peaks-pvalue are mutually exclusive." )
 354     p_or_q_group2.add_argument( "--peaks-pvalue", dest = "peaks_pvalue", type = float,
 355                                help = "Pvalue cutoff for peak detection. DEFAULT: not set. --peaks-qvalue and --peaks-pvalue are mutually exclusive." )
 356     group_diffpeak.add_argument( "-m", "--peak-min-len", dest = "pminlen", type = int,
 357                                     help = "Minimum length of peak regions. DEFAULT: 200", default = 200 )
 358     group_diffpeak.add_argument( "--diff-min-len", dest = "dminlen", type = int,
 359                                     help = "Minimum length of differential region (must overlap a valid peak). DEFAULT: 50", default = 100 )
 360     group_diffpeak.add_argument( "--ignore-duplicate-peaks", dest="ignore_duplicate_peaks", action="store_false",
 361                          help="If set, MACS will ignore duplicate regions with identical coordinates. Helpful if --call-summits was set. DEFAULT: True",default=True)
 362     return
 363
 364 def add_filterdup_parser( subparsers ):
 365     argparser_filterdup = subparsers.add_parser( "filterdup",
 366                                                  help = "Remove duplicate reads at the same position, then save the rest alignments to BED or BEDPE file. If you use '--keep-dup all option', this script can be utilized to convert any acceptable format into BED or BEDPE format." )
 367     argparser_filterdup.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
 368                                       help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
 369     argparser_filterdup.add_argument( "-f", "--format", dest = "format", type = str,
 370                                       choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"),
 371                                       help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"",
 372                                       default = "AUTO" )
 373     argparser_filterdup.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
 374                                       help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), DEFAULT:hs" )
 375     argparser_filterdup.add_argument( "-s", "--tsize", dest = "tsize", type = int,
 376                                       help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set" )
 377     argparser_filterdup.add_argument( "-p", "--pvalue", dest = "pvalue", type = float,
 378                                       help = "Pvalue cutoff for binomial distribution test. DEFAULT:1e-5" )
 379     argparser_filterdup.add_argument( "--keep-dup", dest = "keepduplicates", type = str, default = "auto",
 380                                       help = "It controls the '%(prog)s' behavior towards duplicate tags/pairs at the exact same location -- the same coordination and the same strand. The 'auto' option makes '%(prog)s' calculate the maximum tags at the exact same location based on binomal distribution using given -p as pvalue cutoff; and the 'all' option keeps every tags (useful if you only want to convert formats). If an integer is given, at most this number of tags will be kept at the same location. Note, MACS2 callpeak function uses KEEPDUPLICATES=1 as default. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS2 will still read them although the reads may be decided by MACS2 as duplicate later. Default: auto" )
 381     argparser_filterdup.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
 382                                       help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
 383
 384     argparser_filterdup.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
 385                                       help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
 386     add_outdir_option( argparser_filterdup )
 387     argparser_filterdup.add_argument( "-o", "--ofile", dest = "outputfile", type = str,
 388                                       help = "Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. DEFAULT: stdout",
 389                                       default = "stdout" )
 390     argparser_filterdup.add_argument( "-d", "--dry-run", dest="dryrun", action="store_true", default=False,
 391                                       help = "When set, filterdup will only output numbers instead of writing output files, including maximum allowable duplicates, total number of reads before filtering, total number of reads after filtering, and redundant rate. Default: not set" )
 392     return
 393
 394 def add_bdgpeakcall_parser( subparsers ):
 395     """Add function 'peak calling on bedGraph' argument parsers.
 396     """
 397     argparser_bdgpeakcall = subparsers.add_parser( "bdgpeakcall",
 398                                                    help = "Call peaks from bedGraph output. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
 399     argparser_bdgpeakcall.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True,
 400                                         help = "MACS score in bedGraph. REQUIRED" )
 401     argparser_bdgpeakcall.add_argument( "-c", "--cutoff" , dest = "cutoff", type = float,
 402                                         help = "Cutoff depends on which method you used for score track. If the file contains pvalue scores from MACS2, score 5 means pvalue 1e-5. DEFAULT: 5", default = 5 )
 403     argparser_bdgpeakcall.add_argument( "-l", "--min-length", dest = "minlen", type = int,
 404                                        help = "minimum length of peak, better to set it as d value. DEFAULT: 200", default = 200 )
 405     argparser_bdgpeakcall.add_argument( "-g", "--max-gap", dest = "maxgap", type = int,
 406                                        help = "maximum gap between significant points in a peak, better to set it as tag size. DEFAULT: 30", default = 30 )
 407     argparser_bdgpeakcall.add_argument( "--call-summits", dest="call_summits", action="store_true", help=ap.SUPPRESS, default=False)
 408 #                         help="If set, MACS will use a more sophisticated approach to find all summits in each enriched peak region. DEFAULT: False",default=False)
 409     argparser_bdgpeakcall.add_argument( "--cutoff-analysis", dest="cutoff_analysis", action="store_true",
 410                                         help = "While set, bdgpeakcall will analyze number or total length of peaks that can be called by different cutoff then output a summary table to help user decide a better cutoff. Note, minlen and maxgap may affect the results. DEFAULT: False", default = False )
 411     argparser_bdgpeakcall.add_argument("--no-trackline", dest="trackline", action="store_false", default=True,
 412                          help="Tells MACS not to include trackline with bedGraph files. The trackline is required by UCSC.")
 413
 414     add_outdir_option( argparser_bdgpeakcall )
 415     add_output_group( argparser_bdgpeakcall )
 416
 417     return
 418
 419 def add_bdgbroadcall_parser( subparsers ):
 420     """Add function 'broad peak calling on bedGraph' argument parsers.
 421     """
 422     argparser_bdgbroadcall = subparsers.add_parser( "bdgbroadcall",
 423                                                     help = "Call broad peaks from bedGraph output. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
 424     argparser_bdgbroadcall.add_argument( "-i", "--ifile", dest = "ifile" , type = str, required = True,
 425                                          help = "MACS score in bedGraph. REQUIRED" )
 426     argparser_bdgbroadcall.add_argument( "-c", "--cutoff-peak", dest = "cutoffpeak", type = float,
 427                                          help = "Cutoff for peaks depending on which method you used for score track. If the file contains qvalue scores from MACS2, score 2 means qvalue 0.01. DEFAULT: 2",
 428                                          default = 2 )
 429     argparser_bdgbroadcall.add_argument( "-C", "--cutoff-link", dest = "cutofflink", type = float,
 430                                          help = "Cutoff for linking regions/low abundance regions depending on which method you used for score track. If the file contains qvalue scores from MACS2, score 1 means qvalue 0.1, and score 0.3 means qvalue 0.5. DEFAULT: 1", default = 1 )
 431     argparser_bdgbroadcall.add_argument( "-l", "--min-length", dest = "minlen", type = int,
 432                                          help = "minimum length of peak, better to set it as d value. DEFAULT: 200", default = 200 )
 433     argparser_bdgbroadcall.add_argument( "-g", "--lvl1-max-gap", dest = "lvl1maxgap", type = int,
 434                                          help = "maximum gap between significant peaks, better to set it as tag size. DEFAULT: 30", default = 30 )
 435     argparser_bdgbroadcall.add_argument( "-G", "--lvl2-max-gap", dest = "lvl2maxgap", type = int,
 436                                          help = "maximum linking between significant peaks, better to set it as 4 times of d value. DEFAULT: 800", default = 800)
 437     argparser_bdgbroadcall.add_argument("--no-trackline", dest="trackline", action="store_false", default=True,
 438                                          help="Tells MACS not to include trackline with bedGraph files. The trackline is required by UCSC.")
 439     add_outdir_option( argparser_bdgbroadcall )
 440     add_output_group( argparser_bdgbroadcall )
 441     return
 442
 443 def add_bdgcmp_parser( subparsers ):
 444     """Add function 'peak calling on bedGraph' argument parsers.
 445     """
 446     argparser_bdgcmp = subparsers.add_parser( "bdgcmp",
 447                                               help = "Deduct noise by comparing two signal tracks in bedGraph. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
 448     argparser_bdgcmp.add_argument( "-t", "--tfile", dest = "tfile", type = str, required = True,
 449                                    help = "Treatment bedGraph file, e.g. *_treat_pileup.bdg from MACSv2. REQUIRED")
 450     argparser_bdgcmp.add_argument( "-c", "--cfile", dest = "cfile", type = str, required = True,
 451                                    help = "Control bedGraph file, e.g. *_control_lambda.bdg from MACSv2. REQUIRED")
 452     argparser_bdgcmp.add_argument( "-S", "--scaling-factor", dest = "sfactor", type = float, default = 1.0,
 453                                    help = "Scaling factor for treatment and control track. Keep it as 1.0 or default in most cases. Set it ONLY while you have SPMR output from MACS2 callpeak, and plan to calculate scores as MACS2 callpeak module. If you want to simulate 'callpeak' w/o '--to-large', calculate effective smaller sample size after filtering redudant reads in million (e.g., put 31.415926 if effective reads are 31,415,926) and input it for '-S'; for 'callpeak --to-large', calculate effective reads in larger sample. DEFAULT: 1.0")
 454     argparser_bdgcmp.add_argument( "-p", "--pseudocount", dest = "pseudocount", type = float, default = 0.0,
 455                                    help = "The pseudocount used for calculating logLR, logFE or FE. The count will be applied after normalization of sequencing depth. DEFAULT: 0.0, no pseudocount is applied.")
 456
 457     argparser_bdgcmp.add_argument( "-m", "--method", dest = "method", type = str, nargs = "+",
 458                                    choices = ( "ppois", "qpois", "subtract", "logFE", "FE", "logLR", "slogLR", "max" ),
 459                                    help = "Method to use while calculating a score in any bin by comparing treatment value and control value. Available choices are: ppois, qpois, subtract, logFE, logLR, and slogLR. They represent Poisson Pvalue (-log10(pvalue) form) using control as lambda and treatment as observation, q-value through a BH process for poisson pvalues, subtraction from treatment, linear scale fold enrichment, log10 fold enrichment(need to set pseudocount), log10 likelihood between ChIP-enriched model and open chromatin model(need to set pseudocount), symmetric log10 likelihood between two ChIP-enrichment models, or maximum value between the two tracks. Default option is ppois.",default="ppois")
 460
 461     add_outdir_option( argparser_bdgcmp )
 462     output_group = argparser_bdgcmp.add_mutually_exclusive_group( required = True )
 463     output_group.add_argument( "--o-prefix", dest = "oprefix", type = str,
 464                                help = "The PREFIX of output bedGraph file to write scores. If it is given as A, and method is 'ppois', output file will be A_ppois.bdg. Mutually exclusive with -o/--ofile." )
 465     output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str, nargs = "+",
 466                                help = "Output filename. Mutually exclusive with --o-prefix. The number and the order of arguments for --ofile must be the same as for -m." )
 467     return
 468
 469 def add_bdgopt_parser( subparsers ):
 470     """Add function 'operations on score column of bedGraph' argument parsers.
 471     """
 472     argparser_bdgopt = subparsers.add_parser( "bdgopt",
 473                                               help = "Operations on score column of bedGraph file. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
 474     argparser_bdgopt.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True,
 475                                    help = "MACS score in bedGraph. Note: this must be a bedGraph file covering the ENTIRE genome. REQUIRED" )
 476     argparser_bdgopt.add_argument( "-m", "--method", dest = "method", type = str,
 477                                    choices = ( "multiply", "add", "p2q", "max", "min" ),
 478                                    help = "Method to modify the score column of bedGraph file. Available choices are: multiply, add, max, min, or p2q. 1) multiply, the EXTRAPARAM is required and will be multiplied to the score column. If you intend to divide the score column by X, use value of 1/X as EXTRAPARAM. 2) add, the EXTRAPARAM is required and will be added to the score column. If you intend to subtract the score column by X, use value of -X as EXTRAPARAM. 3) max, the EXTRAPARAM is required and will take the maximum value between score and the EXTRAPARAM. 4) min, the EXTRAPARAM is required and will take the minimum value between score and the EXTRAPARAM. 5) p2q, this will convert p-value scores to q-value scores using Benjamini-Hochberg process. The EXTRAPARAM is not required. This method assumes the scores are -log10 p-value from MACS2. Any other types of score will cause unexpected errors.", default="p2q")
 479     argparser_bdgopt.add_argument( "-p", "--extra-param", dest = "extraparam", type = float, nargs = "*",
 480                                    help = "The extra parameter for METHOD. Check the detail of -m option.")
 481     add_outdir_option( argparser_bdgopt )
 482     argparser_bdgopt.add_argument( "-o", "--ofile", dest = "ofile", type = str,
 483                                    help = "Output BEDGraph filename.", required = True )
 484     return
 485
 486 def add_cmbreps_parser( subparsers ):
 487     """Add function 'combine replicates' argument parsers.
 488     """
 489     argparser_cmbreps = subparsers.add_parser( "cmbreps",
 490                                                help = "Combine BEDGraphs of scores from replicates. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
 491     argparser_cmbreps.add_argument( "-i", dest = "ifile", type = str, required = True, nargs = "+",
 492                                     help = "MACS score in bedGraph for each replicate. Require at least 2 files such as '-i A B C D'. REQUIRED" )
 493     # argparser_cmbreps.add_argument( "-w", dest = "weights", type = float, nargs = "*",
 494     #                                 help = "Weight for each replicate. Default is 1.0 for each. When given, require same number of parameters as IFILE." )
 495     argparser_cmbreps.add_argument( "-m", "--method", dest = "method", type = str,
 496                                     choices = ( "fisher", "max", "mean" ),
 497                                     help = "Method to use while combining scores from replicates. 1) fisher: Fisher's combined probability test. It requires scores in ppois form (-log10 pvalues) from bdgcmp. Other types of scores for this method may cause cmbreps unexpected errors. 2) max: take the maximum value from replicates for each genomic position. 3) mean: take the average value. Note, except for Fisher's method, max or mean will take scores AS IS which means they won't convert scores from log scale to linear scale or vice versa.", default="fisher")
 498     add_outdir_option( argparser_cmbreps )
 499     argparser_cmbreps.add_argument( "-o", "--ofile", dest = "ofile", type = str, required = True,
 500                                     help = "Output BEDGraph filename for combined scores." )
 501     return
 502
 503 def add_randsample_parser( subparsers ):
 504     argparser_randsample = subparsers.add_parser( "randsample",
 505                                                   help = "Randomly sample number/percentage of total reads." )
 506     argparser_randsample.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
 507                                       help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
 508
 509     p_or_n_group = argparser_randsample.add_mutually_exclusive_group( required = True )
 510     p_or_n_group.add_argument( "-p", "--percentage", dest = "percentage", type = float,
 511                                help = "Percentage of tags you want to keep. Input 80.0 for 80%%. This option can't be used at the same time with -n/--num. REQUIRED")
 512     p_or_n_group.add_argument( "-n", "--number", dest = "number", type = float,
 513                                help = "Number of tags you want to keep. Input 8000000 or 8e+6 for 8 million. This option can't be used at the same time with -p/--percent. Note that the number of tags in output is approximate as the number specified here. REQUIRED" )
 514     argparser_randsample.add_argument( "--seed", dest = "seed", type = int, default = -1,
 515                                        help = "Set the random seed while down sampling data. Must be a non-negative integer in order to be effective. DEFAULT: not set" )
 516     argparser_randsample.add_argument( "-o", "--ofile", dest = "outputfile", type = str,
 517                                       help = "Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. DEFAULT: stdout",
 518                                       default = None)
 519     add_outdir_option( argparser_randsample )
 520     argparser_randsample.add_argument( "-s", "--tsize", dest = "tsize", type = int, default = None,
 521                                        help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set")
 522     argparser_randsample.add_argument( "-f", "--format", dest = "format", type = str,
 523                                        choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"),
 524                                        help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will %(prog)s decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"",
 525                                        default = "AUTO" )
 526     argparser_randsample.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
 527                                        help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
 528
 529     argparser_randsample.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
 530                                        help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
 531     return
 532
 533 def add_bdgdiff_parser( subparsers ):
 534     argparser_bdgdiff = subparsers.add_parser( "bdgdiff",
 535                                                help = "Differential peak detection based on paired four bedgraph files. Note: All regions on the same chromosome in the bedGraph file should be continuous so only bedGraph files from MACS2 are accpetable." )
 536     argparser_bdgdiff.add_argument( "--t1", dest = "t1bdg", type = str, required = True,
 537                                     help = "MACS pileup bedGraph for condition 1. Incompatible with callpeak --SPMR output. REQUIRED" )
 538     argparser_bdgdiff.add_argument( "--t2", dest="t2bdg", type = str, required = True,
 539                                     help = "MACS pileup bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED" )
 540     argparser_bdgdiff.add_argument( "--c1", dest = "c1bdg", type = str, required = True,
 541                                     help = "MACS control lambda bedGraph for condition 1. Incompatible with callpeak --SPMR output. REQUIRED" )
 542     argparser_bdgdiff.add_argument( "--c2", dest="c2bdg", type = str, required = True,
 543                                     help = "MACS control lambda bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED" )
 544     argparser_bdgdiff.add_argument( "-C", "--cutoff", dest = "cutoff", type = float,
 545                                     help = "logLR cutoff. DEFAULT: 3 (likelihood ratio=1000)", default = 3 )
 546     argparser_bdgdiff.add_argument( "-l", "--min-len", dest = "minlen", type = int,
 547                                     help = "Minimum length of differential region. Try bigger value to remove small regions. DEFAULT: 200", default = 200 )
 548     argparser_bdgdiff.add_argument( "-g", "--max-gap", dest = "maxgap", type = int,
 549                                     help = "Maximum gap to merge nearby differential regions. Consider a wider gap for broad marks. Maximum gap should be smaller than minimum length (-g). DEFAULT: 100", default = 100 )
 550     argparser_bdgdiff.add_argument( "--d1", "--depth1", dest = "depth1", type = float, default = 1.0,
 551                                     help = "Sequencing depth (# of non-redundant reads in million) for condition 1. It will be used together with --d2. See description for --d2 below for how to assign them. Default: 1" )
 552     argparser_bdgdiff.add_argument( "--d2", "--depth2", dest = "depth2", type = float, default = 1.0,
 553                                     help = "Sequencing depth (# of non-redundant reads in million) for condition 2. It will be used together with --d1. DEPTH1 and DEPTH2 will be used to calculate scaling factor for each sample, to down-scale larger sample to the level of smaller one. For example, while comparing 10 million condition 1 and 20 million condition 2, use --d1 10 --d2 20, then pileup value in bedGraph for condition 2 will be divided by 2. Default: 1" )
 554
 555     add_outdir_option( argparser_bdgdiff )
 556     output_group = argparser_bdgdiff.add_mutually_exclusive_group( required = True )
 557     output_group.add_argument( "--o-prefix", dest = "oprefix", type = str,
 558                                help = "Output file prefix. Actual files will be named as PREFIX_cond1.bed, PREFIX_cond2.bed and PREFIX_common.bed. Mutually exclusive with -o/--ofile." )
 559     output_group.add_argument( "-o", "--ofile", dest = "ofile", type = str, nargs = 3,
 560                                help = "Output filenames. Must give three arguments in order: 1. file for unique regions in condition 1; 2. file for unique regions in condition 2; 3. file for common regions in both conditions. Note: mutually exclusive with --o-prefix." )
 561
 562     return
 563
 564 def add_refinepeak_parser( subparsers ):
 565     argparser_refinepeak = subparsers.add_parser( "refinepeak",
 566                                                  help = "(Experimental) Take raw reads alignment, refine peak summits and give scores measuring balance of waston/crick tags. Inspired by SPP." )
 567     argparser_refinepeak.add_argument( "-b", dest = "bedfile", type = str, required = True,
 568                                       help = "Candidate peak file in BED format. REQUIRED." )
 569     argparser_refinepeak.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
 570                                       help = "ChIP-seq alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
 571     argparser_refinepeak.add_argument( "-f", "--format", dest = "format", type = str,
 572                                       choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE"),
 573                                       help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"",
 574                                       default = "AUTO" )
 575     argparser_refinepeak.add_argument( "-c", "--cutoff" , dest = "cutoff", type = float,
 576                                         help = "Cutoff  DEFAULT: 5", default = 5 )
 577     argparser_refinepeak.add_argument( "-w", "--window-size", dest= "windowsize", help = 'Scan window size on both side of the summit (default: 100bp)',
 578                                         type = int, default = 200)
 579     argparser_refinepeak.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
 580                                        help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
 581
 582     argparser_refinepeak.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
 583                                        help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
 584
 585     add_outdir_option( argparser_refinepeak )
 586     add_output_group( argparser_refinepeak )
 587
 588     return
 589
 590 def add_predictd_parser( subparsers ):
 591     """Add main function 'predictd' argument parsers.
 592     """
 593     argparser_predictd = subparsers.add_parser("predictd", help="Predict d or fragment size from alignment results. *Will NOT filter duplicates*")
 594
 595     # group for input files
 596     argparser_predictd.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
 597                                      help = "ChIP-seq alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
 598     argparser_predictd.add_argument( "-f", "--format", dest = "format", type = str,
 599                                      choices = ("AUTO", "BAM", "SAM", "BED", "ELAND",
 600                                                 "ELANDMULTI", "ELANDEXPORT", "BOWTIE",
 601                                                 "BAMPE", "BEDPE"),
 602                                      help = "Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"",
 603                                      default = "AUTO" )
 604     argparser_predictd.add_argument( "-g", "--gsize", dest = "gsize", type = str, default = "hs",
 605                                      help = "Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2.7e9), 'mm' for mouse (1.87e9), 'ce' for C. elegans (9e7) and 'dm' for fruitfly (1.2e8), Default:hs" )
 606     argparser_predictd.add_argument( "-s", "--tsize",  dest = "tsize", type = int, default = None,
 607                                      help = "Tag size. This will override the auto detected tag size. DEFAULT: Not set")
 608     argparser_predictd.add_argument( "--bw", dest = "bw", type = int, default = 300,
 609                                      help = "Band width for picking regions to compute fragment size. This value is only used while building the shifting model. DEFAULT: 300")
 610     argparser_predictd.add_argument( "--d-min", dest = "d_min", type = int, default = 20,
 611                                      help = "Minimum fragment size in basepair. Any predicted fragment size less than this will be excluded. DEFAULT: 20")
 612     argparser_predictd.add_argument( "-m", "--mfold", dest = "mfold", type = int, default = [5,50], nargs = 2,
 613                                      help = "Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. Fold-enrichment in regions must be lower than upper limit, and higher than the lower limit. Use as \"-m 10 30\". DEFAULT:5 50" )
 614
 615     add_outdir_option( argparser_predictd )
 616     argparser_predictd.add_argument( "--rfile", dest = "rfile", type = str, default = "predictd",
 617                                      help = "PREFIX of filename of R script for drawing X-correlation figure. DEFAULT:'predictd' and R file will be predicted_model.R" )
 618     argparser_predictd.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
 619                                      help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
 620
 621     argparser_predictd.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
 622                                      help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )
 623
 624     return
 625
 626 def add_pileup_parser( subparsers ):
 627     argparser_pileup = subparsers.add_parser( "pileup",
 628                                               help = "Pileup aligned reads with a given extension size (fragment size or d in MACS language). Note there will be no step for duplicate reads filtering or sequencing depth scaling, so you may need to do certain pre/post-processing." )
 629     argparser_pileup.add_argument( "-i", "--ifile", dest = "ifile", type = str, required = True, nargs = "+",
 630                                    help = "Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED." )
 631     argparser_pileup.add_argument( "-o", "--ofile", dest = "outputfile", type = str, required = True,
 632                                    help = "Output bedGraph file name. If not specified, will write to standard output. REQUIRED." )
 633     add_outdir_option( argparser_pileup )
 634     argparser_pileup.add_argument( "-f", "--format", dest = "format", type = str,
 635                                    choices=("AUTO","BAM","SAM","BED","ELAND","ELANDMULTI","ELANDEXPORT","BOWTIE","BAMPE","BEDPE"),
 636                                    help = "Format of tag file, \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\", \"BOWTIE\", \"BAMPE\", or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. DEFAULT: \"AUTO\", MACS2 will pick a format from \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\" and \"BOWTIE\". If the format is BAMPE or BEDPE, please specify it explicitly. Please note that when the format is BAMPE or BEDPE, the -B and --extsize options would be ignored.",
 637                                    default = "AUTO" )
 638     argparser_pileup.add_argument( "-B", "--both-direction", dest = "bothdirection", action = "store_true", default = False,
 639                                    help = "By default, any read will be extended towards downstream direction by extension size. So it's [0,size-1] (1-based index system) for plus strand read and [-size+1,0] for minus strand read where position 0 is 5' end of the aligned read. Default behavior can simulate MACS2 way of piling up ChIP sample reads where extension size is set as fragment size/d. If this option is set as on, aligned reads will be extended in both upstream and downstream directions by extension size. It means [-size,size] where 0 is the 5' end of a aligned read. It can partially simulate MACS2 way of piling up control reads. However MACS2 local bias is calculated by maximizing the expected pileup over a ChIP fragment size/d estimated from 10kb, 1kb, d and whole genome background. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: False" )
 640
 641     argparser_pileup.add_argument( "--extsize", dest = "extsize", type = int, default = 200,
 642                                    help = "The extension size in bps. Each alignment read will become a EXTSIZE of fragment, then be piled up. Check description for -B for detail. It's twice the `shiftsize` in old MACSv1 language. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: 200 " )
 643     argparser_pileup.add_argument( "--buffer-size", dest = "buffer_size", type = int, default = "100000",
 644                                    help = "Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 " )
 645
 646     argparser_pileup.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
 647                                       help = "Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2" )
 648     return
 649
 650
 651 if __name__ == '__main__':
 652     try:
 653         main()
 654     except KeyboardInterrupt:
 655         sys.stderr.write("User interrupted me! ;-) Bye!\n")
 656         sys.exit(0)