bin/macs3

   1 #!/usr/bin/env python
   2 # Time-stamp: <2024-10-02 12:29:58 Tao Liu>
   3
   4 """Description: MACS v3 main executable.
   5
   6 This code is free software; you can redistribute it and/or modify it
   7 under the terms of the BSD License (see the file LICENSE included with
   8 the distribution).
   9 """
  10
  11 # ------------------------------------
  12 # python modules
  13 # ------------------------------------
  14
  15 import os
  16 import sys
  17 import argparse as ap
  18 import tempfile
  19
  20 # ------------------------------------
  21 # own python modules
  22 # ------------------------------------
  23 from MACS3.Utilities.Constants import MACS_VERSION
  24
  25 # ------------------------------------
  26 # Main function
  27 # ------------------------------------
  28
  29
  30 def main():
  31     """The Main function/pipeline for MACS.
  32
  33     """
  34     # Parse options...
  35     argparser = prepare_argparser()
  36     args = argparser.parse_args()
  37
  38     subcommand = args.subcommand
  39
  40     if args.outdir:
  41         # use a output directory to store MACS output
  42         if not os.path.exists(args.outdir):
  43             try:
  44                 os.makedirs(args.outdir)
  45             except FileExistsError:
  46                 sys.exit("Output directory (%s) could not be created since it already exists. Terminate program." % args.outdir)
  47             except PermissionError:
  48                 sys.exit("Output directory (%s) could not be created due to permission. Terminate program." % args.outdir)
  49             except Exception:
  50                 sys.exit("Output directory (%s) could not be created. Terminate program." % args.outdir)
  51
  52     if subcommand == "callpeak":
  53         # General call peak
  54         from MACS3.Commands.callpeak_cmd import run
  55         run(args)
  56     elif subcommand == "bdgpeakcall":
  57         # call peak from bedGraph
  58         from MACS3.Commands.bdgpeakcall_cmd import run
  59         run(args)
  60     elif subcommand == "bdgbroadcall":
  61         # call broad peak from bedGraph
  62         from MACS3.Commands.bdgbroadcall_cmd import run
  63         run(args)
  64     elif subcommand == "bdgcmp":
  65         # compare treatment and control to make enrichment scores
  66         from MACS3.Commands.bdgcmp_cmd import run
  67         run(args)
  68     elif subcommand == "bdgopt":
  69         # operations on the score column of bedGraph file
  70         from MACS3.Commands.bdgopt_cmd import run
  71         run(args)
  72     elif subcommand == "cmbreps":
  73         # combine replicates
  74         from MACS3.Commands.cmbreps_cmd import run
  75         run(args)
  76     elif subcommand == "randsample":
  77         # randomly sample sequencing reads, and save as bed file
  78         from MACS3.Commands.randsample_cmd import run
  79         run(args)
  80     elif subcommand == "filterdup":
  81         # filter out duplicate reads, and save as bed file
  82         from MACS3.Commands.filterdup_cmd import run
  83         run(args)
  84     elif subcommand == "bdgdiff":
  85         # differential calling
  86         from MACS3.Commands.bdgdiff_cmd import run
  87         run(args)
  88     elif subcommand == "refinepeak":
  89         # refine peak summits
  90         from MACS3.Commands.refinepeak_cmd import run
  91         run(args)
  92     elif subcommand == "predictd":
  93         # predict d or fragment size
  94         from MACS3.Commands.predictd_cmd import run
  95         run(args)
  96     elif subcommand == "pileup":
  97         # pileup alignment results with a given extension method
  98         from MACS3.Commands.pileup_cmd import run
  99         run(args)
 100     elif subcommand == "hmmratac":
 101         # use HMMRATAC algorithm to call ATAC-seq peaks
 102         from MACS3.Commands.hmmratac_cmd import run
 103         run(args)
 104     elif subcommand == "callvar":
 105         # assemble reads in peak region and call variants
 106         from MACS3.Commands.callvar_cmd import run
 107         run(args)
 108
 109
 110 def prepare_argparser():
 111     """Prepare optparser object. New options will be added in this
 112     function first.
 113
 114     """
 115     description = "%(prog)s -- Model-based Analysis for ChIP-Sequencing"
 116     epilog = "For command line options of each command, type: %(prog)s COMMAND -h"
 117     # top-level parser
 118     argparser = ap.ArgumentParser(description=description, epilog=epilog)
 119     argparser.add_argument("--version", action="version", version="%(prog)s " + MACS_VERSION)
 120     subparsers = argparser.add_subparsers(dest='subcommand')
 121     subparsers.required = True
 122
 123     # command for 'callpeak'
 124     add_callpeak_parser(subparsers)
 125
 126     # command for 'bdgpeakcall'
 127     add_bdgpeakcall_parser(subparsers)
 128
 129     # command for 'bdgbroadcall'
 130     add_bdgbroadcall_parser(subparsers)
 131
 132     # command for 'bdgcmp'
 133     add_bdgcmp_parser(subparsers)
 134
 135     # command for 'bdgopt'
 136     add_bdgopt_parser(subparsers)
 137
 138     # command for 'cmbreps'
 139     add_cmbreps_parser(subparsers)
 140
 141     # command for 'bdgdiff'
 142     add_bdgdiff_parser(subparsers)
 143
 144     # command for 'filterdup'
 145     add_filterdup_parser(subparsers)
 146
 147     # command for 'predictd'
 148     add_predictd_parser(subparsers)
 149
 150     # command for 'pileup'
 151     add_pileup_parser(subparsers)
 152
 153     # command for 'randsample'
 154     add_randsample_parser(subparsers)
 155
 156     # command for 'refinepeak'
 157     add_refinepeak_parser(subparsers)
 158
 159     # command for 'callvar'
 160     add_callvar_parser(subparsers)
 161
 162     # command for 'hmmratac'
 163     add_hmmratac_parser(subparsers)
 164
 165     return argparser
 166
 167
 168 def add_outdir_option(parser):
 169     parser.add_argument("--outdir", dest="outdir", type=str, default='',
 170                         help="If specified all output files will be written to that directory. Default: the current working directory")
 171
 172
 173 def add_output_group(parser, required=True):
 174     output_group = parser.add_mutually_exclusive_group(required=required)
 175     output_group.add_argument("-o", "--ofile", dest="ofile", type=str,
 176                               help="Output file name. Mutually exclusive with --o-prefix.")
 177     output_group.add_argument("--o-prefix", dest="oprefix", type=str,
 178                               help="Output file prefix. Mutually exclusive with -o/--ofile.")
 179
 180
 181 def add_callpeak_parser(subparsers):
 182     """Add main function 'peak calling' argument parsers.
 183     """
 184     argparser_callpeak = subparsers.add_parser("callpeak", help="Main MACS3 Function: Call peaks from alignment results.",
 185                                                formatter_class=ap.RawDescriptionHelpFormatter,
 186                                                epilog="""Examples:
 187 1. Peak calling for regular TF ChIP-seq:
 188     $ macs3 callpeak -t ChIP.bam -c Control.bam -f BAM -g hs -n test -B -q 0.01
 189 2. Broad peak calling on Histone Mark ChIP-seq:
 190     $ macs3 callpeak -t ChIP.bam -c Control.bam --broad -g hs --broad-cutoff 0.1
 191 3. Peak calling on ATAC-seq (paired-end mode):
 192     $ macs3 callpeak -f BAMPE -t ATAC.bam -g hs -n test -B -q 0.01
 193 4. Peak calling on ATAC-seq (focusing on insertion sites, and using single-end mode):
 194     $ macs3 callpeak -f BAM -t ATAC.bam -g hs -n test -B -q 0.01 --shift -50 --extension 100
 195 """)
 196
 197     # group for input files
 198     group_input = argparser_callpeak.add_argument_group("Input files arguments")
 199     group_input.add_argument("-t", "--treatment", dest="tfile", type=str, required=True, nargs="+",
 200                              help="ChIP-seq treatment file. If multiple files are given as '-t A B C', then they will all be read and pooled together. REQUIRED.")
 201     group_input.add_argument("-c", "--control", dest="cfile", type=str, nargs="*",
 202                              help="Control file. If multiple files are given as '-c A B C', they will be pooled to estimate ChIP-seq background noise.")
 203     group_input.add_argument("-f", "--format", dest="format", type=str,
 204                              choices=("AUTO", "BAM", "SAM", "BED", "ELAND",
 205                                       "ELANDMULTI", "ELANDEXPORT", "BOWTIE",
 206                                       "BAMPE", "BEDPE"),
 207                              help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format (except for BAMPE and BEDPE which should be implicitly set) the file is. Please check the definition in README. Please note that if the format is set as BAMPE or BEDPE, MACS3 will call its special Paired-end mode to call peaks by piling up the actual ChIPed fragments defined by both aligned ends, instead of predicting the fragment size first and extending reads. Also please note that the BEDPE only contains three columns, and is NOT the same BEDPE format used by BEDTOOLS. DEFAULT: \"AUTO\"",
 208                              default="AUTO")
 209     group_input.add_argument("-g", "--gsize", dest="gsize", type=str, default="hs",
 210                              help="Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.")
 211     group_input.add_argument("-s", "--tsize",  dest="tsize", type=int, default=None,
 212                              help="Tag size/read length. This will override the auto detected tag size. DEFAULT: Not set")
 213     group_input.add_argument("--keep-dup", dest="keepduplicates", type=str, default="1",
 214                              help="It controls the  behavior towards duplicate tags at the exact same location -- the same coordination and the same strand. The 'auto' option makes MACS calculate the maximum tags at the exact same location based on binomal distribution using 1e-5 as pvalue cutoff; and the 'all' option keeps every tags. If an integer is given, at most this number of tags will be kept at the same location. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS3 will still read them although the reads may be decided by MACS3 as duplicate later. If you plan to rely on samtools/picard/any other tool to filter duplicates, please remove those duplicate reads and save a new alignment file then ask MACS3 to keep all by '--keep-dup all'. The default is to keep one tag at the same location. Default: 1")
 215
 216     # group for output files
 217     group_output = argparser_callpeak.add_argument_group("Output arguments")
 218     add_outdir_option(group_output)
 219     group_output.add_argument("-n", "--name", dest="name", type=str,
 220                               help="Experiment name, which will be used to generate output file names. DEFAULT: \"NA\"",
 221                               default="NA")
 222     group_output.add_argument("-B", "--bdg", dest="store_bdg", action="store_true",
 223                               help="Whether or not to save extended fragment pileup, and local lambda tracks (two files) at every bp into a bedGraph file. DEFAULT: False",
 224                               default=False)
 225     group_output.add_argument("--verbose", dest="verbose", type=int, default=2,
 226                               help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
 227     group_output.add_argument("--trackline", dest="trackline", action="store_true", default=False,
 228                               help="Instruct MACS to include trackline in the header of output files, including the bedGraph, narrowPeak, gappedPeak, BED format files. To include this trackline is necessary while uploading them to the UCSC genome browser. You can also mannually add these trackline to corresponding output files. For example, in order to upload narrowPeak file to UCSC browser, add this to as the first line -- `track type=narrowPeak name=\"my_peaks\" description=\"my peaks\"`. Default: Not to include trackline.")
 229
 230     group_output.add_argument("--SPMR", dest="do_SPMR", action="store_true", default=False,
 231                               help="If True, MACS will SAVE signal per million reads for fragment pileup profiles. It won't interfere with computing pvalue/qvalue during peak calling, since internally MACS3 keeps using the raw pileup and scaling factors between larger and smaller dataset to calculate statistics measurements. If you plan to use the signal output in bedGraph to call peaks using bdgcmp and bdgpeakcall, you shouldn't use this option because you will end up with different results. However, this option is recommended for displaying normalized pileup tracks across many datasets. Require -B to be set. Default: False")
 232     # group for bimodal
 233     group_bimodal = argparser_callpeak.add_argument_group("Shifting model arguments")
 234     group_bimodal.add_argument("--nomodel", dest="nomodel", action="store_true", default=False,
 235                                help="Whether or not to build the shifting model. If True, MACS will not build model. by default it means shifting size = 100, try to set extsize to change it. It's highly recommended that while you have many datasets to process and you plan to compare different conditions, aka differential calling, use both 'nomodel' and 'extsize' to make signal files from different datasets comparable. DEFAULT: False")
 236     group_bimodal.add_argument("--shift", dest="shift", type=int, default=0,
 237                                help="(NOT the legacy --shiftsize option!) The arbitrary shift in bp. Use discretion while setting it other than default value. When NOMODEL is set, MACS will use this value to move cutting ends (5') towards 5'->3' direction then apply EXTSIZE to extend them to fragments. When this value is negative, ends will be moved toward 3'->5' direction. Recommended to keep it as default 0 for ChIP-Seq datasets, or -1 * half of EXTSIZE together with EXTSIZE option for detecting enriched cutting loci such as certain DNAseI-Seq datasets. Note, you can't set values other than 0 if format is BAMPE or BEDPE for paired-end data. DEFAULT: 0. ")
 238     group_bimodal.add_argument("--extsize", dest="extsize", type=int, default=200,
 239                                help="The arbitrary extension size in bp. When nomodel is true, MACS will use this value as fragment size to extend each read towards 3' end, then pile them up. It's exactly twice the number of obsolete SHIFTSIZE. In previous language, each read is moved 5'->3' direction to middle of fragment by 1/2 d, then extended to both direction with 1/2 d. This is equivalent to say each read is extended towards 5'->3' into a d size fragment. DEFAULT: 200. EXTSIZE and SHIFT can be combined when necessary. Check SHIFT option.")
 240     group_bimodal.add_argument("--bw", dest="bw", type=int, default=300,
 241                                help="Band width for picking regions to compute fragment size. This value is only used while building the shifting model. Tweaking this is not recommended. DEFAULT: 300")
 242     group_bimodal.add_argument("--d-min", dest="d_min", type=int, default=20,
 243                                help="Minimum fragment size in basepair. Any predicted fragment size less than this will be excluded. DEFAULT: 20")
 244     group_bimodal.add_argument("-m", "--mfold", dest="mfold", type=int, default=[5, 50], nargs=2,
 245                                help="Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. Fold-enrichment in regions must be lower than upper limit, and higher than the lower limit. Use as \"-m 10 30\". This setting is only used while building the shifting model. Tweaking it is not recommended. DEFAULT:5 50")
 246
 247     group_bimodal.add_argument("--fix-bimodal", dest="onauto", action="store_true",
 248                                help="Whether turn on the auto pair model process. If set, when MACS failed to build paired model, it will use the nomodel settings, the --exsize parameter to extend each tags towards 3' direction. Not to use this automate fixation is a default behavior now. DEFAULT: False",
 249                                default=False)
 250
 251     # General options.
 252     group_callpeak = argparser_callpeak.add_argument_group("Peak calling arguments")
 253     p_or_q_group = group_callpeak.add_mutually_exclusive_group()
 254     p_or_q_group.add_argument("-q", "--qvalue", dest="qvalue", type=float, default=0.05,
 255                               help="Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.05. -q, and -p are mutually exclusive.")
 256     p_or_q_group.add_argument("-p", "--pvalue", dest="pvalue", type=float,
 257                               help="Pvalue cutoff for peak detection. DEFAULT: not set. -q, and -p are mutually exclusive. If pvalue cutoff is set, qvalue will not be calculated and reported as -1 in the final .xls file.")
 258
 259     # about scaling
 260     group_callpeak.add_argument("--scale-to", dest="scaleto", type=str, choices=("large", "small"),
 261                                 help="When set to 'small', scale the larger sample up to the smaller sample. When set to 'larger', scale the smaller sample up to the bigger sample. By default, scale to 'small'. This option replaces the obsolete '--to-large' option. The default behavior is recommended since it will lead to less significant p/q-values in general but more specific results. Keep in mind that scaling down will influence control/input sample more. DEFAULT: 'small', the choice is either 'small' or 'large'.")
 262     group_callpeak.add_argument("--down-sample", dest="downsample", action="store_true", default=False,
 263                                 help="When set, random sampling method will scale down the bigger sample. By default, MACS uses linear scaling. Warning: This option will make your result unstable and irreproducible since each time, random reads would be selected. Consider to use 'randsample' script instead. <not implmented>If used together with --SPMR, 1 million unique reads will be randomly picked.</not implemented> Caution: due to the implementation, the final number of selected reads may not be as you expected! DEFAULT: False")
 264     group_callpeak.add_argument("--seed", dest="seed", type=int, default=-1,
 265                                 help="Set the random seed while down sampling data. Must be a non-negative integer in order to be effective. DEFAULT: not set")
 266     group_callpeak.add_argument("--tempdir", dest="tempdir", default=tempfile.gettempdir(),
 267                                 help="Optional directory to store temp files. DEFAULT: %(default)s")
 268     group_callpeak.add_argument("--nolambda", dest="nolambda", action="store_true",
 269                                 help="If True, MACS will use fixed background lambda as local lambda for every peak region. Normally, MACS calculates a dynamic local lambda to reflect the local bias due to the potential chromatin accessibility. ",
 270                                 default=False)
 271     group_callpeak.add_argument("--slocal", dest="smalllocal", type=int, default=1000,
 272                                 help="The small nearby region in basepairs to calculate dynamic lambda. This is used to capture the bias near the peak summit region. Invalid if there is no control data. If you set this to 0, MACS will skip slocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation while the control data is available. The final local bias would be the maximum of the lambda value from d, slocal, and llocal size windows. While control is not available, d and slocal lambda won't be considered. DEFAULT: 1000 ")
 273     group_callpeak.add_argument("--llocal", dest="largelocal", type=int, default=10000,
 274                                 help="The large nearby region in basepairs to calculate dynamic lambda. This is used to capture the surround bias. If you set this to 0, MACS will skip llocal lambda calculation. *Note* that MACS will always perform a d-size local lambda calculation while the control data is available. The final local bias would be the maximum of the lambda value from d, slocal, and llocal size windows. While control is not available, d and slocal lambda won't be considered. DEFAULT: 10000.")
 275     group_callpeak.add_argument("--max-gap", dest="maxgap", type=int,
 276                                 help="Maximum gap between significant sites to cluster them together. The DEFAULT value is the detected read length/tag size.")
 277     group_callpeak.add_argument("--min-length", dest="minlen", type=int,
 278                                 help="Minimum length of a peak. The DEFAULT value is the predicted fragment size d. Note, if you set a value smaller than the fragment size, it may have NO effect on the result. For BROAD peak calling, try to set a large value such as 500bps. You can also use '--cutoff-analysis' option with default setting, and  check the column 'avelpeak' under different cutoff values to decide a reasonable minlen value.")
 279     group_callpeak.add_argument("--broad", dest="broad", action="store_true",
 280                                 help="If set, MACS will try to call broad peaks using the --broad-cutoff setting. Please tweak '--broad-cutoff' setting to control the peak calling behavior. At the meantime, either -q or -p cutoff will be used to define regions with 'stronger enrichment' inside of broad peaks. The maximum gap is expanded to 4 * MAXGAP (--max-gap parameter). As a result, MACS will output a 'gappedPeak' and a 'broadPeak' file instead of 'narrowPeak' file. Note, a broad peak will be reported even if there is no 'stronger enrichment' inside. DEFAULT: False", default=False)
 281     group_callpeak.add_argument("--broad-cutoff", dest="broadcutoff", type=float, default=0.1,
 282                                 help="Cutoff for broad region. This option is not available unless --broad is set. If -p is set, this is a pvalue cutoff, otherwise, it's a qvalue cutoff. Please note that in broad peakcalling mode, MACS3 uses this setting to control the overall peak calling behavior, then uses -q or -p setting to define regions inside broad region as 'stronger' enrichment. DEFAULT: 0.1 ")
 283     group_callpeak.add_argument("--cutoff-analysis", dest="cutoff_analysis", action="store_true",
 284                                 help="While set, MACS3 will analyze number or total length of peaks that can be called by different p-value cutoff then output a summary table to help user decide a better cutoff. The table will be saved in NAME_cutoff_analysis.txt file. Note, minlen and maxgap may affect the results. WARNING: May take  ~30 folds longer time to finish. The result can be useful for users to decide a reasonable cutoff value. DEFAULT: False", default=False)
 285
 286     # post-processing options
 287     group_postprocessing = argparser_callpeak.add_argument_group("Post-processing options")
 288     postprocess_group = group_postprocessing.add_mutually_exclusive_group()
 289
 290     postprocess_group.add_argument("--call-summits", dest="call_summits", action="store_true",
 291                                    help="If set, MACS will use a more sophisticated signal processing approach to find subpeak summits in each enriched peak region. DEFAULT: False", default=False)
 292     group_postprocessing.add_argument("--fe-cutoff", dest="fecutoff", type=float, default=1.0,
 293                                       help="When set, the value will be used as the minimum requirement to filter out peaks with low fold-enrichment. Note, MACS3 adds one as pseudocount while calculating fold-enrichment. By default, it is set as 1 so there is no filtering. DEFAULT: 1.0")
 294
 295     # obsolete options
 296     group_obsolete = argparser_callpeak.add_argument_group("Obsolete options")
 297     group_obsolete.add_argument("--to-large", dest="tolarge", action="store_true", default=False,
 298                                 help="Obsolete option. Please use '--scale-to large' instead.")
 299     group_obsolete.add_argument("--ratio", dest="ratio", type=float, default=1.0,
 300                                 help="Obsolete option. Originally designed to normalize treatment and control with customized ratio, now it won't have any effect.")
 301
 302     # other options
 303     group_other = argparser_callpeak.add_argument_group("Other options")
 304     group_other.add_argument("--buffer-size", dest="buffer_size", type=int, default="100000",
 305                              help="Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 ")
 306
 307     return
 308
 309
 310 def add_filterdup_parser(subparsers):
 311     argparser_filterdup = subparsers.add_parser("filterdup",
 312                                                 help="Remove duplicate reads, then save in BED/BEDPE format file.")
 313     argparser_filterdup.add_argument("-i", "--ifile", dest="ifile", type=str, required=True, nargs="+",
 314                                      help="Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. REQUIRED.")
 315     argparser_filterdup.add_argument("-f", "--format", dest="format", type=str,
 316                                      choices=("AUTO", "BAM", "SAM", "BED", "ELAND", "ELANDMULTI", "ELANDEXPORT", "BOWTIE", "BAMPE", "BEDPE"),
 317                                      help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"",
 318                                      default="AUTO")
 319     argparser_filterdup.add_argument("-g", "--gsize", dest="gsize", type=str, default="hs",
 320                                      help="Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.")
 321     argparser_filterdup.add_argument("-s", "--tsize", dest="tsize", type=int,
 322                                      help="Tag size. This will override the auto detected tag size. DEFAULT: Not set")
 323     argparser_filterdup.add_argument("-p", "--pvalue", dest="pvalue", type=float,
 324                                      help="Pvalue cutoff for binomial distribution test. DEFAULT:1e-5")
 325     argparser_filterdup.add_argument("--keep-dup", dest="keepduplicates", type=str, default="auto",
 326                                      help="It controls the '%(prog)s' behavior towards duplicate tags/pairs at the exact same location -- the same coordination and the same strand. The 'auto' option makes '%(prog)s' calculate the maximum tags at the exact same location based on binomal distribution using given -p as pvalue cutoff; and the 'all' option keeps every tags (useful if you only want to convert formats). If an integer is given, at most this number of tags will be kept at the same location. Note, MACS3 callpeak function uses KEEPDUPLICATES=1 as default. Note, if you've used samtools or picard to flag reads as 'PCR/Optical duplicate' in bit 1024, MACS3 will still read them although the reads may be decided by MACS3 as duplicate later. Default: auto")
 327     argparser_filterdup.add_argument("--buffer-size", dest="buffer_size", type=int, default="100000",
 328                                      help="Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 ")
 329     argparser_filterdup.add_argument("--verbose", dest="verbose", type=int, default=2,
 330                                      help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2")
 331     add_outdir_option(argparser_filterdup)
 332     argparser_filterdup.add_argument("-o", "--ofile", dest="outputfile", type=str,
 333                                      help="Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. DEFAULT: stdout",
 334                                      default="stdout")
 335     argparser_filterdup.add_argument("-d", "--dry-run", dest="dryrun", action="store_true", default=False,
 336                                      help="When set, filterdup will only output numbers instead of writing output files, including maximum allowable duplicates, total number of reads before filtering, total number of reads after filtering, and redundant rate. Default: not set")
 337     return
 338
 339 def add_bdgpeakcall_parser(subparsers):
 340     """Add function 'peak calling on bedGraph' argument parsers.
 341     """
 342     argparser_bdgpeakcall = subparsers.add_parser("bdgpeakcall",
 343                                                   help="Call peaks from bedGraph file.")
 344     argparser_bdgpeakcall.add_argument("-i", "--ifile", dest="ifile", type=str, required=True,
 345                                        help="MACS score in bedGraph. REQUIRED")
 346     argparser_bdgpeakcall.add_argument("-c", "--cutoff", dest="cutoff", type=float,
 347                                        help="Cutoff depends on which method you used for score track. If the file contains pvalue scores from MACS3, score 5 means pvalue 1e-5. Regions with signals lower than cutoff will not be considerred as enriched regions. DEFAULT: 5", default=5)
 348     argparser_bdgpeakcall.add_argument("-l", "--min-length", dest="minlen", type=int,
 349                                        help="minimum length of peak, better to set it as d value. DEFAULT: 200", default=200)
 350     argparser_bdgpeakcall.add_argument("-g", "--max-gap", dest="maxgap", type=int,
 351                                        help="maximum gap between significant points in a peak, better to set it as tag size. DEFAULT: 30", default=30)
 352     argparser_bdgpeakcall.add_argument("--call-summits", dest="call_summits", action="store_true", help=ap.SUPPRESS, default=False)
 353     #                         help="If set, MACS will use a more sophisticated approach to find all summits in each enriched peak region. DEFAULT: False",default=False)
 354     argparser_bdgpeakcall.add_argument("--cutoff-analysis", dest="cutoff_analysis", action="store_true",
 355                                        help="While set, bdgpeakcall will analyze number or total length of peaks that can be called by different cutoff then output a summary table to help user decide a better cutoff. Note, minlen and maxgap may affect the results. DEFAULT: False", default=False)
 356     argparser_bdgpeakcall.add_argument("--cutoff-analysis-max", dest="cutoff_analysis_max", type=int,
 357                                        help="The maximum cutoff score for performing cutoff analysis. Together with --cutoff-analysis-steps, the resolution in the final report can be controlled. Please check the description in --cutoff-analysis-steps for detail. DEFAULT: 100",
 358                                        default=100)
 359     argparser_bdgpeakcall.add_argument("--cutoff-analysis-steps", dest="cutoff_analysis_steps", type=int,
 360                                        help="Steps for performing cutoff analysis. It will be used to decide which cutoff value should be included in the final report. Larger the value, higher resolution the cutoff analysis can be. The cutoff analysis function will first find the smallest (at least 0) and the largest (controlled by --cutoff-analysis-max) score in the data, then break the range of score into `CUTOFF_ANALYSIS_STEPS` intervals. It will then use each score as cutoff to call peaks and calculate the total number of candidate peaks, the total basepairs of peaks, and the average length of peak in basepair. Please note that the final report ideally should include `CUTOFF_ANALYSIS_STEPS` rows, but in practice, if the cutoff yield zero peak, the row for that value won't be included.  DEFAULT: 100", default=100)
 361     argparser_bdgpeakcall.add_argument("--no-trackline", dest="trackline", action="store_false", default=True,
 362                                        help="Tells MACS not to include trackline with bedGraph files. The trackline is used by UCSC for the options for display.")
 363     argparser_bdgpeakcall.add_argument("--verbose", dest="verbose", type=int, default=2,
 364                                        help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
 365
 366     add_outdir_option(argparser_bdgpeakcall)
 367     add_output_group(argparser_bdgpeakcall)
 368
 369     return
 370
 371 def add_bdgbroadcall_parser(subparsers):
 372     """Add function 'broad peak calling on bedGraph' argument parsers.
 373     """
 374     argparser_bdgbroadcall = subparsers.add_parser("bdgbroadcall",
 375                                                    help="Call nested broad peaks from bedGraph file.")
 376     argparser_bdgbroadcall.add_argument("-i", "--ifile", dest="ifile", type=str, required=True,
 377                                         help="MACS score in bedGraph. REQUIRED")
 378     argparser_bdgbroadcall.add_argument("-c", "--cutoff-peak", dest="cutoffpeak", type=float,
 379                                         help="Cutoff for peaks depending on which method you used for score track. If the file contains qvalue scores from MACS3, score 2 means qvalue 0.01. Regions with signals lower than cutoff will not be considerred as enriched regions. DEFAULT: 2",
 380                                         default=2)
 381     argparser_bdgbroadcall.add_argument("-C", "--cutoff-link", dest="cutofflink", type=float,
 382                                         help="Cutoff for linking regions/low abundance regions depending on which method you used for score track. If the file contains qvalue scores from MACS3, score 1 means qvalue 0.1, and score 0.3 means qvalue 0.5. DEFAULT: 1", default=1)
 383     argparser_bdgbroadcall.add_argument("-l", "--min-length", dest="minlen", type=int,
 384                                         help="minimum length of peak, better to set it as d value. DEFAULT: 200", default=200)
 385     argparser_bdgbroadcall.add_argument("-g", "--lvl1-max-gap", dest="lvl1maxgap", type=int,
 386                                         help="maximum gap between significant peaks, better to set it as tag size. DEFAULT: 30", default=30)
 387     argparser_bdgbroadcall.add_argument("-G", "--lvl2-max-gap", dest="lvl2maxgap", type=int,
 388                                         help="maximum linking between significant peaks, better to set it as 4 times of d value. DEFAULT: 800", default=800)
 389     argparser_bdgbroadcall.add_argument("--no-trackline", dest="trackline", action="store_false", default=True,
 390                                         help="Tells MACS not to include trackline with bedGraph files. The trackline is required by UCSC.")
 391     argparser_bdgbroadcall.add_argument("--verbose", dest="verbose", type=int, default=2,
 392                                         help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
 393     add_outdir_option(argparser_bdgbroadcall)
 394     add_output_group(argparser_bdgbroadcall)
 395     return
 396
 397 def add_bdgcmp_parser(subparsers):
 398     """Add function 'peak calling on bedGraph' argument parsers.
 399     """
 400     argparser_bdgcmp = subparsers.add_parser("bdgcmp",
 401                                              help="Comparing two signal tracks in bedGraph format.")
 402     argparser_bdgcmp.add_argument("-t", "--tfile", dest="tfile", type=str, required=True,
 403                                   help="Treatment bedGraph file, e.g. *_treat_pileup.bdg from MACSv2. REQUIRED")
 404     argparser_bdgcmp.add_argument("-c", "--cfile", dest="cfile", type=str, required=True,
 405                                   help="Control bedGraph file, e.g. *_control_lambda.bdg from MACSv2. REQUIRED")
 406     argparser_bdgcmp.add_argument("-S", "--scaling-factor", dest="sfactor", type=float, default=1.0,
 407                                   help="Scaling factor for treatment and control track. Keep it as 1.0 or default in most cases. Set it ONLY while you have SPMR output from MACS3 callpeak, and plan to calculate scores as MACS3 callpeak module. If you want to simulate 'callpeak' w/o '--to-large', calculate effective smaller sample size after filtering redudant reads in million (e.g., put 31.415926 if effective reads are 31,415,926) and input it for '-S'; for 'callpeak --to-large', calculate effective reads in larger sample. DEFAULT: 1.0")
 408     argparser_bdgcmp.add_argument("-p", "--pseudocount", dest="pseudocount", type=float, default=0.0,
 409                                   help="The pseudocount used for calculating logLR, logFE or FE. The count will be applied after normalization of sequencing depth. DEFAULT: 0.0, no pseudocount is applied.")
 410
 411     argparser_bdgcmp.add_argument("-m", "--method", dest="method", type=str, nargs="+",
 412                                   choices=("ppois", "qpois", "subtract", "logFE", "FE", "logLR", "slogLR", "max"),
 413                                   help="Method to use while calculating a score in any bin by comparing treatment value and control value. Available choices are: ppois, qpois, subtract, logFE, FE, logLR, slogLR, and max. They represent Poisson Pvalue (-log10(pvalue) form) using control as lambda and treatment as observation, q-value through a BH process for poisson pvalues, subtraction from treatment, linear scale fold enrichment, log10 fold enrichment(need to set pseudocount), log10 likelihood between ChIP-enriched model and open chromatin model(need to set pseudocount), symmetric log10 likelihood between two ChIP-enrichment models, or maximum value between the two tracks. Default option is ppois.",default="ppois")
 414     argparser_bdgcmp.add_argument("--verbose", dest="verbose", type=int, default=2,
 415                                   help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
 416
 417     add_outdir_option(argparser_bdgcmp)
 418     output_group = argparser_bdgcmp.add_mutually_exclusive_group(required=True)
 419     output_group.add_argument("--o-prefix", dest="oprefix", type=str,
 420                               help="The PREFIX of output bedGraph file to write scores. If it is given as A, and method is 'ppois', output file will be A_ppois.bdg. Mutually exclusive with -o/--ofile.")
 421     output_group.add_argument("-o", "--ofile", dest="ofile", type=str, nargs="+",
 422                               help="Output filename. Mutually exclusive with --o-prefix. The number and the order of arguments for --ofile must be the same as for -m.")
 423     return
 424
 425 def add_bdgopt_parser(subparsers):
 426     """Add function 'operations on score column of bedGraph' argument parsers.
 427     """
 428     argparser_bdgopt = subparsers.add_parser("bdgopt",
 429                                              help="Operate the score column of bedGraph file.")
 430     argparser_bdgopt.add_argument("-i", "--ifile", dest="ifile", type=str, required=True,
 431                                   help="MACS score in bedGraph. Note: this must be a bedGraph file covering the ENTIRE genome. REQUIRED")
 432     argparser_bdgopt.add_argument("-m", "--method", dest="method", type=str,
 433                                   choices=("multiply", "add", "p2q", "max", "min"),
 434                                   help="Method to modify the score column of bedGraph file. Available choices are: multiply, add, max, min, or p2q. 1) multiply, the EXTRAPARAM is required and will be multiplied to the score column. If you intend to divide the score column by X, use value of 1/X as EXTRAPARAM. 2) add, the EXTRAPARAM is required and will be added to the score column. If you intend to subtract the score column by X, use value of -X as EXTRAPARAM. 3) max, the EXTRAPARAM is required and will take the maximum value between score and the EXTRAPARAM. 4) min, the EXTRAPARAM is required and will take the minimum value between score and the EXTRAPARAM. 5) p2q, this will convert p-value scores to q-value scores using Benjamini-Hochberg process. The EXTRAPARAM is not required. This method assumes the scores are -log10 p-value from MACS3. Any other types of score will cause unexpected errors.", default="p2q")
 435     argparser_bdgopt.add_argument("-p", "--extra-param", dest="extraparam", type=float, nargs="*",
 436                                   help="The extra parameter for METHOD. Check the detail of -m option.")
 437     add_outdir_option(argparser_bdgopt)
 438     argparser_bdgopt.add_argument("-o", "--ofile", dest="ofile", type=str,
 439                                   help="Output BEDGraph filename.", required=True)
 440     argparser_bdgopt.add_argument("--verbose", dest="verbose", type=int, default=2,
 441                                   help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
 442     return
 443
 444
 445 def add_cmbreps_parser(subparsers):
 446     """Add function 'combine replicates' argument parsers.
 447     """
 448     argparser_cmbreps = subparsers.add_parser("cmbreps",
 449                                               help="Combine bedGraph files of scores from replicates.")
 450     argparser_cmbreps.add_argument("-i", dest="ifile", type=str, required=True, nargs="+",
 451                                    help="MACS score in bedGraph for each replicate. Require at least 2 files such as '-i A B C D'. REQUIRED")
 452     # argparser_cmbreps.add_argument("-w", dest="weights", type=float, nargs="*",
 453     #                                help="Weight for each replicate. Default is 1.0 for each. When given, require same number of parameters as IFILE.")
 454     argparser_cmbreps.add_argument("-m", "--method", dest="method", type=str,
 455                                    choices=("fisher", "max", "mean"),
 456                                    help="Method to use while combining scores from replicates. 1) fisher: Fisher's combined probability test. It requires scores in ppois form (-log10 pvalues) from bdgcmp. Other types of scores for this method may cause cmbreps unexpected errors. 2) max: take the maximum value from replicates for each genomic position. 3) mean: take the average value. Note, except for Fisher's method, max or mean will take scores AS IS which means they won't convert scores from log scale to linear scale or vice versa.", default="fisher")
 457     add_outdir_option(argparser_cmbreps)
 458     argparser_cmbreps.add_argument("-o", "--ofile", dest="ofile", type=str, required=True,
 459                                    help="Output BEDGraph filename for combined scores.")
 460     argparser_cmbreps.add_argument("--verbose", dest="verbose", type=int, default=2,
 461                                    help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
 462     return
 463
 464
 465 def add_randsample_parser(subparsers):
 466     argparser_randsample = subparsers.add_parser("randsample",
 467                                                  help="Randomly choose a number/percentage of total reads, then save in BED/BEDPE format file.")
 468     argparser_randsample.add_argument("-i", "--ifile", dest="ifile", type=str, required=True, nargs="+",
 469                                       help="Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. REQUIRED.")
 470
 471     p_or_n_group = argparser_randsample.add_mutually_exclusive_group(required=True)
 472     p_or_n_group.add_argument("-p", "--percentage", dest="percentage", type=float,
 473                               help="Percentage of tags you want to keep. Input 80.0 for 80%%. This option can't be used at the same time with -n/--num. If the setting is 100, it will keep all the reads and convert any format that MACS3 supports into BED or BEDPE (if input is BAMPE) format. REQUIRED")
 474     p_or_n_group.add_argument("-n", "--number", dest="number", type=float,
 475                               help="Number of tags you want to keep. Input 8000000 or 8e+6 for 8 million. This option can't be used at the same time with -p/--percent. Note that the number of tags in output is approximate as the number specified here. REQUIRED")
 476     argparser_randsample.add_argument("--seed", dest="seed", type=int, default=-1,
 477                                       help="Set the random seed while down sampling data. Must be a non-negative integer in order to be effective. If you want more reproducible results, please specify a random seed and record it.DEFAULT: not set")
 478     argparser_randsample.add_argument("-o", "--ofile", dest="outputfile", type=str,
 479                                       help="Output BED file name. If not specified, will write to standard output. Note, if the input format is BAMPE or BEDPE, the output will be in BEDPE format. DEFAULT: stdout",
 480                                       default=None)
 481     add_outdir_option(argparser_randsample)
 482     argparser_randsample.add_argument("-s", "--tsize", dest="tsize", type=int, default=None,
 483                                       help="Tag size. This will override the auto detected tag size. DEFAULT: Not set")
 484     argparser_randsample.add_argument("-f", "--format", dest="format", type=str,
 485                                       choices=("AUTO", "BAM", "SAM", "BED", "ELAND", "ELANDMULTI", "ELANDEXPORT", "BOWTIE", "BAMPE", "BEDPE"),
 486                                       help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will %(prog)s decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE or BAMPE/BEDPE. DEFAULT: \"AUTO\"",
 487                                       default="AUTO")
 488     argparser_randsample.add_argument("--buffer-size", dest="buffer_size", type=int, default="100000",
 489                                       help="Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 ")
 490
 491     argparser_randsample.add_argument("--verbose", dest="verbose", type=int, default=2,
 492                                       help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2")
 493     return
 494
 495
 496 def add_bdgdiff_parser(subparsers):
 497     argparser_bdgdiff = subparsers.add_parser("bdgdiff",
 498                                               help="Differential peak detection based on paired four bedGraph files.")
 499     argparser_bdgdiff.add_argument("--t1", dest="t1bdg", type=str, required=True,
 500                                    help="MACS pileup bedGraph for condition 1. Incompatible with callpeak --SPMR output. REQUIRED")
 501     argparser_bdgdiff.add_argument("--t2", dest="t2bdg", type=str, required=True,
 502                                    help="MACS pileup bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED")
 503     argparser_bdgdiff.add_argument("--c1", dest="c1bdg", type=str, required=True,
 504                                    help="MACS control lambda bedGraph for condition 1. Incompatible with callpeak --SPMR output. REQUIRED")
 505     argparser_bdgdiff.add_argument("--c2", dest="c2bdg", type=str, required=True,
 506                                    help="MACS control lambda bedGraph for condition 2. Incompatible with callpeak --SPMR output. REQUIRED")
 507     argparser_bdgdiff.add_argument("-C", "--cutoff", dest="cutoff", type=float,
 508                                    help="log10LR cutoff. Regions with signals lower than cutoff will not be considerred as enriched regions. DEFAULT: 3 (likelihood ratio=1000)", default=3)
 509     argparser_bdgdiff.add_argument("-l", "--min-len", dest="minlen", type=int,
 510                                    help="Minimum length of differential region. Try bigger value to remove small regions. DEFAULT: 200", default=200)
 511     argparser_bdgdiff.add_argument("-g", "--max-gap", dest="maxgap", type=int,
 512                                    help="Maximum gap to merge nearby differential regions. Consider a wider gap for broad marks. Maximum gap should be smaller than minimum length (-g). DEFAULT: 100", default=100)
 513     argparser_bdgdiff.add_argument("--d1", "--depth1", dest="depth1", type=float, default=1.0,
 514                                    help="Sequencing depth (# of non-redundant reads in million) for condition 1. It will be used together with --d2. See description for --d2 below for how to assign them. Default: 1")
 515     argparser_bdgdiff.add_argument("--d2", "--depth2", dest="depth2", type=float, default=1.0,
 516                                    help="Sequencing depth (# of non-redundant reads in million) for condition 2. It will be used together with --d1. DEPTH1 and DEPTH2 will be used to calculate scaling factor for each sample, to down-scale larger sample to the level of smaller one. For example, while comparing 10 million condition 1 and 20 million condition 2, use --d1 10 --d2 20, then pileup value in bedGraph for condition 2 will be divided by 2. Default: 1")
 517     argparser_bdgdiff.add_argument("--verbose", dest="verbose", type=int, default=2,
 518                                    help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
 519
 520     add_outdir_option(argparser_bdgdiff)
 521     output_group = argparser_bdgdiff.add_mutually_exclusive_group(required=True)
 522     output_group.add_argument("--o-prefix", dest="oprefix", type=str,
 523                               help="Output file prefix. Actual files will be named as PREFIX_cond1.bed, PREFIX_cond2.bed and PREFIX_common.bed. Mutually exclusive with -o/--ofile.")
 524     output_group.add_argument("-o", "--ofile", dest="ofile", type=str, nargs=3,
 525                               help="Output filenames. Must give three arguments in order: 1. file for unique regions in condition 1; 2. file for unique regions in condition 2; 3. file for common regions in both conditions. Note: mutually exclusive with --o-prefix.")
 526
 527     return
 528
 529 def add_refinepeak_parser(subparsers):
 530     argparser_refinepeak = subparsers.add_parser("refinepeak",
 531                                                  help="Take raw reads alignment, refine peak summits. Inspired by SPP.")
 532     argparser_refinepeak.add_argument("-b", dest="bedfile", type=str, required=True,
 533                                       help="Candidate peak file in BED format. REQUIRED.")
 534     argparser_refinepeak.add_argument("-i", "--ifile", dest="ifile", type=str, required=True, nargs="+",
 535                                       help="ChIP-seq alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. Note that pair-end data is not supposed to work with this command. REQUIRED.")
 536     argparser_refinepeak.add_argument("-f", "--format", dest="format", type=str,
 537                                       choices=("AUTO", "BAM", "SAM", "BED", "ELAND", "ELANDMULTI", "ELANDEXPORT", "BOWTIE"),
 538                                       help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\". The default AUTO option will let '%(prog)s' decide which format the file is. Please check the definition in README file if you choose ELAND/ELANDMULTI/ELANDEXPORT/SAM/BAM/BOWTIE. DEFAULT: \"AUTO\"",
 539                                       default="AUTO")
 540     argparser_refinepeak.add_argument("-c", "--cutoff", dest="cutoff", type=float,
 541                                       help="Cutoff. Regions with SPP wtd score lower than cutoff will not be considerred.  DEFAULT: 5", default=5)
 542     argparser_refinepeak.add_argument("-w", "--window-size", dest="windowsize", help='Scan window size on both side of the summit (default: 100bp)',
 543                                       type=int, default=200)
 544     argparser_refinepeak.add_argument("--buffer-size", dest="buffer_size", type=int, default="100000",
 545                                       help="Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 ")
 546
 547     argparser_refinepeak.add_argument("--verbose", dest="verbose", type=int, default=2,
 548                                       help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2")
 549
 550     add_outdir_option(argparser_refinepeak)
 551     add_output_group(argparser_refinepeak)
 552
 553     return
 554
 555
 556 def add_predictd_parser(subparsers):
 557     """Add main function 'predictd' argument parsers.
 558     """
 559     argparser_predictd = subparsers.add_parser("predictd", help="Predict d or fragment size from alignment results. In case of PE data, report the average insertion/fragment size from all pairs.")
 560
 561     # group for input files
 562     argparser_predictd.add_argument("-i", "--ifile", dest="ifile", type=str, required=True, nargs="+",
 563                                     help="ChIP-seq alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. REQUIRED.")
 564     argparser_predictd.add_argument("-f", "--format", dest="format", type=str,
 565                                     choices=("AUTO", "BAM", "SAM", "BED", "ELAND",
 566                                              "ELANDMULTI", "ELANDEXPORT", "BOWTIE",
 567                                              "BAMPE", "BEDPE"),
 568                                     help="Format of tag file, \"AUTO\", \"BED\" or \"ELAND\" or \"ELANDMULTI\" or \"ELANDEXPORT\" or \"SAM\" or \"BAM\" or \"BOWTIE\" or \"BAMPE\" or \"BEDPE\". The default AUTO option will let MACS decide which format the file is. However, if you want to decide the average insertion size/fragment size from PE data such as BEDPE or BAMPE, please specify the format as BAMPE or BEDPE since MACS3 won't automatically recognize three two formats with -f AUTO. Please be aware that in PE mode, -g, -s, --bw, --d-min, -m, and --rfile have NO effect. DEFAULT: \"AUTO\"",
 569                                     default="AUTO")
 570     argparser_predictd.add_argument("-g", "--gsize", dest="gsize", type=str, default="hs",
 571                                     help="Effective genome size. It can be 1.0e+9 or 1000000000, or shortcuts:'hs' for human (2,913,022,398), 'mm' for mouse (2,652,783,500), 'ce' for C. elegans (100,286,401) and 'dm' for fruitfly (142,573,017), Default:hs. The effective genome size numbers for the above four species are collected from Deeptools https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html Please refer to deeptools to define the best genome size you plan to use.")
 572     argparser_predictd.add_argument("-s", "--tsize",  dest="tsize", type=int, default=None,
 573                                     help="Tag size. This will override the auto detected tag size. DEFAULT: Not set")
 574     argparser_predictd.add_argument("--bw", dest="bw", type=int, default=300,
 575                                     help="Band width for picking regions to compute fragment size. This value is only used while building the shifting model. DEFAULT: 300")
 576     argparser_predictd.add_argument("--d-min", dest="d_min", type=int, default=20,
 577                                     help="Minimum fragment size in basepair. Any predicted fragment size less than this will be excluded. DEFAULT: 20")
 578     argparser_predictd.add_argument("-m", "--mfold", dest="mfold", type=int, default=[5,50], nargs=2,
 579                                     help="Select the regions within MFOLD range of high-confidence enrichment ratio against background to build model. Fold-enrichment in regions must be lower than upper limit, and higher than the lower limit. Use as \"-m 10 30\". DEFAULT:5 50")
 580
 581     add_outdir_option(argparser_predictd)
 582     argparser_predictd.add_argument("--rfile", dest="rfile", type=str, default="predictd_model.R",
 583                                     help="PREFIX of filename of R script for drawing X-correlation figure. DEFAULT:'predictd_model.R' and R file will be predicted_model.R")
 584     argparser_predictd.add_argument("--buffer-size", dest="buffer_size", type=int, default="100000",
 585                                     help="Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 ")
 586
 587     argparser_predictd.add_argument("--verbose", dest="verbose", type=int, default=2,
 588                                     help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
 589
 590     return
 591
 592
 593 def add_pileup_parser(subparsers):
 594     argparser_pileup = subparsers.add_parser("pileup",
 595                                              help="Pileup aligned reads (single-end) or fragments (paired-end).")
 596     argparser_pileup.add_argument("-i", "--ifile", dest="ifile", type=str, required=True, nargs="+",
 597                                   help="Alignment file. If multiple files are given as '-t A B C', then they will all be read and combined. REQUIRED.")
 598     argparser_pileup.add_argument("-o", "--ofile", dest="outputfile", type=str, required=True,
 599                                   help="Output bedGraph file name. If not specified, will write to standard output. REQUIRED.")
 600     add_outdir_option(argparser_pileup)
 601     argparser_pileup.add_argument("-f", "--format", dest="format", type=str,
 602                                   choices=("AUTO", "BAM", "SAM", "BED", "ELAND", "ELANDMULTI", "ELANDEXPORT", "BOWTIE", "BAMPE", "BEDPE"),
 603                                   help="Format of tag file, \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\", \"BOWTIE\", \"BAMPE\", or \"BEDPE\". The default AUTO option will let '%(prog)s' decide which format the file is. DEFAULT: \"AUTO\", MACS3 will pick a format from \"AUTO\", \"BED\", \"ELAND\", \"ELANDMULTI\", \"ELANDEXPORT\", \"SAM\", \"BAM\" and \"BOWTIE\". If the format is BAMPE or BEDPE, please specify it explicitly. Please note that when the format is BAMPE or BEDPE, the -B and --extsize options would be ignored.",
 604                                   default="AUTO")
 605     argparser_pileup.add_argument("-B", "--both-direction", dest="bothdirection", action="store_true", default=False,
 606                                   help="By default, any read will be extended towards downstream direction by extension size. So it's [0,size-1] (1-based index system) for plus strand read and [-size+1,0] for minus strand read where position 0 is 5' end of the aligned read. Default behavior can simulate MACS3 way of piling up ChIP sample reads where extension size is set as fragment size/d. If this option is set as on, aligned reads will be extended in both upstream and downstream directions by extension size. It means [-size,size] where 0 is the 5' end of a aligned read. It can partially simulate MACS3 way of piling up control reads. However MACS3 local bias is calculated by maximizing the expected pileup over a ChIP fragment size/d estimated from 10kb, 1kb, d and whole genome background. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: False")
 607
 608     argparser_pileup.add_argument("--extsize", dest="extsize", type=int, default=200,
 609                                   help="The extension size in bps. Each alignment read will become a EXTSIZE of fragment, then be piled up. Check description for -B for detail. It's twice the `shiftsize` in old MACSv1 language. This option will be ignored when the format is set as BAMPE or BEDPE. DEFAULT: 200 ")
 610     argparser_pileup.add_argument("--buffer-size", dest="buffer_size", type=int, default="100000",
 611                                   help="Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 ")
 612
 613     argparser_pileup.add_argument("--verbose", dest="verbose", type=int, default=2,
 614                                   help="Set verbose level. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. If you want to know where are the duplicate reads, use 3. DEFAULT:2")
 615     return
 616
 617
 618 def add_callvar_parser(subparsers):
 619     """Add function 'variant calling' argument parsers.
 620     """
 621     argparser_callvar = subparsers.add_parser("callvar",
 622                                               formatter_class=ap.RawDescriptionHelpFormatter,
 623                                               help="Call variants in given peak regions from the alignment BAM files.",
 624                                               epilog=""" Assuming you have two types of BAM files. The first type, what we
 625 call `TREAT`, is from DNA enrichment assay such as ChIP-seq or
 626 ATAC-seq where the DNA fragments in the sequencing library are
 627 enriched in certain genomics regions with potential allele biases; the
 628 second type, called `CTRL` for control, is from genomic assay in which
 629 the DNA enrichment is less biased in multiploid chromosomes and more
 630 uniform across the whole genome (the later one is optional). In order
 631 to run `callvar`, please sort (by coordinates) and index the BAM
 632 files. Example:
 633
 634 1. Sort the BAM file:
 635     $ samtools sort TREAT.bam -o TREAT_sorted.bam
 636     $ samtools sort CTRL.bam -o CTRL_sorted.bam
 637 2. Index the BAM file:
 638     $ samtools index TREAT_sorted.bam
 639     $ samtools index CTRL_sorted.bam
 640 3. Make sure .bai files are available:
 641     $ ls TREAT_sorted.bam.bai
 642     $ ls CTRL_sorted.bam.bai
 643
 644 To call variants:
 645     $ macs3 callvar -b peaks.bed -t TREAT_sorted.bam -c CTRL_sorted.bam -o peaks.vcf
 646 """)
 647     # group for input files
 648     group_input = argparser_callvar.add_argument_group("Input files arguments")
 649     group_input.add_argument("-b", "--peak", dest="peakbed", type=str, required =True,
 650                              help="Peak regions in BED format, sorted by coordinates. REQUIRED.")
 651     group_input.add_argument("-t", "--treatment", dest="tfile", type=str, required=True,
 652                              help="ChIP-seq/ATAC-seq treatment file in BAM format, sorted by coordinates. Make sure the .bai file is avaiable in the same directory. REQUIRED.")
 653     group_input.add_argument("-c", "--control", dest="cfile", type=str, required=False,
 654                              help="Optional control file in BAM format, sorted by coordinates. Make sure the .bai file is avaiable in the same directory.")
 655     # group for output files
 656     group_output = argparser_callvar.add_argument_group("Output arguments")
 657     add_outdir_option(group_output)
 658     group_output.add_argument("-o", "--ofile", dest="ofile", type=str, required=True,
 659                               help="Output VCF file name.")
 660     group_output.add_argument("--verbose", dest="verbose", type=int, default=2,
 661                               help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2")
 662     # group for parameters
 663     group_para = argparser_callvar.add_argument_group("Variant calling arguments")
 664     group_para.add_argument("-g", "--gq-hetero", dest="GQCutoffHetero", type=float,
 665                             help="Genotype Quality score (-10log10((L00+L11)/(L01+L00+L11))) cutoff for Heterozygous allele type. Default:0, or there is no cutoff on GQ.", default=0)
 666     group_para.add_argument("-G", "--gq-homo", dest="GQCutoffHomo", type=float,
 667                             help="Genotype Quality score (-10log10((L00+L01)/(L01+L00+L11))) cutoff for Homozygous allele (not the same as reference) type. Default:0, or ther is no cutoff on GQ.", default=0)
 668     group_para.add_argument("-Q", dest="Q", type=int, default=20,
 669                             help="Only consider bases with quality score greater than this value. Default: 20, which means Q20 or 0.01 error rate.")
 670     group_para.add_argument("-D", dest="maxDuplicate", type=int, default=1,
 671                             help="Maximum duplicated reads allowed per mapping position, mapping strand and the same CIGAR code. Default: 1. When sequencing depth is high, to set a higher value might help evaluate the correct allele ratio.")
 672     group_para.add_argument("-F", "--fermi", dest="fermi", type=str, default="auto",
 673                             help="Option to control when to apply local assembly through fermi-lite. By default (set as 'auto'), while callvar detects any INDEL variant in a peak region, it will utilize fermi-lite to recover the actual DNA sequences to refine the read alignments. If set as 'on', fermi-lite will be always invoked. It can increase specificity however sensivity and speed will be significantly lower. If set as 'off', Fermi won't be invoked at all. If so, speed and sensitivity can be higher but specificity will be significantly lower. Default: auto")
 674     group_para.add_argument("--fermi-overlap", dest="fermiMinOverlap", type=int,
 675                             help="The minimal overlap for fermi to initially assemble two reads. Must be between 1 and read length. A longer fermiMinOverlap is needed while read length is small (e.g. 30 for 36bp read, but 33 for 100bp read may work). Default:30", default=30)
 676     group_para.add_argument("--top2alleles-mratio", dest="top2allelesMinRatio", type=float,
 677                             help="The reads for the top 2 most frequent alleles (e.g. a ref allele and an alternative allele) at a loci shouldn't be too few comparing to total reads mapped. The minimum ratio is set by this optoin. Must be a float between 0.5 and 1.  Default:0.8 which means at least 80%% of reads contain the top 2 alleles.", default=0.8)
 678     group_para.add_argument("--altallele-count", dest="altalleleMinCount", type=int,
 679                             help="The count of the alternative (non-reference) allele at a loci shouldn't be too few. By default, we require at least two reads support the alternative allele. Default:2", default=2)
 680     group_para.add_argument("--max-ar", dest="maxAR", type=float,
 681                             help="The maximum Allele-Ratio allowed while calculating likelihood for allele-specific binding. If we allow higher maxAR, we may mistakenly assign some homozygous loci as heterozygous. Default:0.95", default=0.95)
 682     # group for misc
 683     group_misc = argparser_callvar.add_argument_group("Misc arguments")
 684     group_misc.add_argument("-m", "--multiple-processing", dest="np", type=int, default=1,
 685                             help="CPU used for mutliple processing. Please note that, assigning more CPUs does not guarantee the process being faster. Creating too many parrallel processes need memory operations and  may negate benefit from multi processing. Default: 1")
 686     return
 687
 688
 689 def add_hmmratac_parser(subparsers):
 690     """Add function 'HMMRATAC' argument parsers.
 691     """
 692     argparser_hmmratac = subparsers.add_parser("hmmratac",
 693                                                formatter_class=ap.RawDescriptionHelpFormatter,
 694                                                help="Dedicated peak calling based on Hidden Markov Model for ATAC-seq data.",
 695                                                epilog="""HMMRATAC is a dedicated tool for processing ATAC-seq data
 696
 697 HMMRATAC, first released as a JAVA program in 2019, is a dedicated
 698 tool specifically designed for processing ATAC-seq data. In MACS3, it
 699 has been integrated as a subcommand (`hmmratac`). Reimagined and
 700 rewritten in Python and Cython, this tool's functionality might vary
 701 from its JAVA version. The core principle of HMMRATAC involves
 702 utilizing the Hidden Markov Model to learn the nucleosome structure
 703 around open chromatin regions.
 704
 705 Here's an example of how to run the `hmmratac` command:
 706
 707 ```
 708 $ macs3 hmmratac -i yeast.bam -n yeast
 709 ```
 710
 711 or with the BEDPE format
 712
 713 ```
 714 $ macs3 hmmratac -i yeast.bedpe.gz -f BEDPE -n yeast
 715 ```
 716
 717 Note: you can convert BAMPE to BEDPE by using
 718
 719 ```
 720 $ macs3 filterdup --keep-dup all -f BAMPE -i yeast.bam -o yeast.bedpe
 721 ```
 722
 723 The final output from `hmmratac` is in narrowPeak format containing
 724 the accessible regions (open state in `hmmratac` Hidden Markov
 725 Moedel). The columns in the narrowPeak file are:
 726
 727  1. chromosome name
 728  2. start position of the accessible region
 729  3. end position of the accesssible region
 730  4. peak name
 731  5. peak score. The peak score represents the maximum fold change
 732     (signal/average signal) within the peak. By default, the signal is
 733     the total pileup of all types of fragments, ranging from short to
 734     tri-nucleosome-sized fragments.
 735  6. Not used
 736  7. Not used
 737  8. Not used
 738  9. peak summit position. It's the relative position from the start
 739     position to the peak summit which is defined as the position with
 740     the maximum foldchange score.
 741
 742 Before proceeding, it's essential to carefully read the help messages
 743 concerning each option and the default parameters. There are several
 744 crucial parameters we usually need to specify:
 745
 746 1. Lower cutoff to select training regions through `-l`.
 747 2. Upper cutoff to select training regions through `-u`.
 748 3. Pre-scanning cutoff to select candidate regions for
 749    decoding/predicting the states, including open, nucleosome, and
 750    background states, through `-c`.
 751
 752 These three parameters can significantly influence the
 753 results. Therefore, it's highly recommended to run `macs3 hmmratac
 754 --cutoff-analysis-only -i your.bam` first, which will help you decide
 755 the parameters for `-l`, `-u`, and `-c`. Since there isn't an
 756 automatic way to determine these parameters, your judgement will be
 757 vital. Please read the output from `macs3 hmmratac
 758 --cutoff-analysis-only -i your.bam` and the following section `Choices
 759 of cutoff values` for guidance.
 760
 761 * Choices of cutoff values *
 762
 763 Before you proceed, it's highly recommended to run with
 764 `--cutoff-analysis-only` for the initial attempt. When this option is
 765 activated, `hmmratac` will use EM to estimate the best parameters for
 766 fragment sizes of short fragments, mono-, di-, and tri-nucleosomes,
 767 pileup fragments, convert the pileup values into fold-change, and
 768 analyze each possible cutoff. This analysis includes the number of
 769 peaks that can be called, their average peak length, and their total
 770 length. After the report is generated, you can review its contents and
 771 decide on the optimal `-l`, `-u`, and `-c`.
 772
 773 The report consists of four columns:
 774
 775 1. score: the possible fold change cutoff value.
 776 2. npeaks: the number of peaks under this cutoff.
 777 3. lpeaks: the total length of all peaks.
 778 4. avelpeak: the average length of peaks.
 779
 780 While there's no universal rule, here are a few suggestions:
 781
 782 - The lower cutoff should be the cutoff in the report that captures a
 783   moderate number (about 10k-30k) of peaks with a normal width (average
 784   length 500-1000bps long).
 785 - The upper cutoff should capture some (typically hundreds of)
 786   extremely high enrichment and unusually wide peaks in the
 787   report. The aim here is to exclude abnormal enrichment caused by
 788   artifacts such as repetitive regions.
 789 - The pre-scanning cutoff should be the cutoff close to the BOTTOM of
 790   the report that can capture a large number of potential peaks with a
 791   normal length (average length 500-1000bps). However, it's
 792   recommended not to use the lowest cutoff value in the report as this
 793   may include too much noise from the genome.
 794
 795 * Tune the HMM model *
 796
 797 It's highly recommended to check the runtime message of the HMM model
 798 after training. An example is like this:
 799
 800 ```
 801 #4 Train Hidden Markov Model with Multivariate Gaussian Emission
 802 #  Extract signals in training regions with bin size of 10
 803 #  Use Baum-Welch algorithm to train the HMM
 804 #   HMM converged: True
 805 #  Write HMM parameters into JSON: test_model.json
 806 #  The Hidden Markov Model for signals of binsize of 10 basepairs:
 807 #   open state index: state2
 808 #   nucleosomal state index: state1
 809 #   background state index: state0
 810 #   Starting probabilities of states:
 811 #                            bg        nuc       open
 812 #                        0.7994     0.1312    0.06942
 813 #   HMM Transition probabilities:
 814 #                            bg        nuc       open
 815 #               bg->     0.9842    0.01202   0.003759
 816 #              nuc->    0.03093     0.9562    0.01287
 817 #             open->   0.007891    0.01038     0.9817
 818 #   HMM Emissions (mean):
 819 #                         short       mono         di        tri
 820 #               bg:      0.2551      1.526     0.4646    0.07071
 821 #              nuc:       6.538      17.94      3.422    0.05819
 822 #             open:       5.016      17.47      6.897      2.121
 823 ```
 824
 825 We will 'guess' which hidden state is for the open region, which is
 826 the nucleosomal region, and which is the background. We compute from
 827 the HMM Emission matrix to pick the state with the highest sum of mean
 828 signals as the open state, the lowest as the backgound state, then the
 829 rest is the nucleosomal state. However it may not work in every
 830 case. In the above example, it may be tricky to call the second row as
 831 'nuc' and the third as 'open'. If the users want to exchange the state
 832 assignments of the 'nuc' and 'open', they can modify the state
 833 assignment in the HMM model file (e.g. test_model.json). For the above
 834 example, the model.json looks like this (we skipped some detail):
 835
 836 ```
 837 {"startprob": [...], "transmat": [...], "means": [...], "covars": [...],
 838 "covariance_type": "full", "n_features": 4,
 839 "i_open_region": 2, "i_background_region": 0, "i_nucleosomal_region": 1,
 840 "hmm_binsize": 10}
 841 ```
 842
 843 We can modify the assignment of: `"i_open_region": 2,
 844 "i_background_region": 0, "i_nucleosomal_region": 1,` by assigning `1`
 845 to open, and `2` to nucleosomal as: `"i_open_region": 1,
 846 "i_background_region": 0, "i_nucleosomal_region": 2,` Then save the
 847 HMM in a new model file such as `new_model.json`.
 848
 849 Then next, we can re-run `macs3 hmmratac` with the same parameters
 850 plus an extra option for the HMM model file like `macs3 hmmratac
 851 --model new_model.json`
 852
 853 """)
 854
 855     # group for input files
 856     group_input = argparser_hmmratac.add_argument_group("Input files arguments")
 857     group_input.add_argument("-i", "--input", dest="input_file", type=str, required=True, nargs="+",
 858                              help="Input files containing the aligment results for ATAC-seq paired end reads. If multiple files are given as '-t A B C', then they will all be read and pooled together. The file should be in BAMPE or BEDPE format (aligned in paired end mode). Files can be gzipped. Note: all files should be in the same format! REQUIRED.")
 859     group_input.add_argument("-f", "--format", dest="format", type=str,
 860                              choices=("BAMPE", "BEDPE"),
 861                              help="Format of input files, \"BAMPE\" or \"BEDPE\". If there are multiple files, they should be in the same format -- either BAMPE or BEDPE. Please check the definition in README. Also please note that the BEDPE only contains three columns -- chromosome, left position of the whole pair, right position of the whole pair-- and is NOT the same BEDPE format used by BEDTOOLS. To convert BAMPE to BEDPE, you can use this command `macs3 filterdup --keep-dup all -f BAMPE -i input.bam -o output.bedpe`. DEFAULT: \"BAMPE\"",
 862                              default="BAMPE")
 863
 864     # group for output files
 865     group_output = argparser_hmmratac.add_argument_group("Output arguments")
 866
 867     add_outdir_option(group_output)
 868     group_output.add_argument("-n", "--name", dest="name", type=str,
 869                               help="Name for this experiment, which will be used as a prefix to generate output file names. DEFAULT: \"NA\"",
 870                               default="NA")
 871     group_output.add_argument("--cutoff-analysis-only", dest="cutoff_analysis_only", action="store_true",
 872                               help="Only run the cutoff analysis and output a report. After generating the report, the process will stop. By default, the cutoff analysis will be included in the whole process, but won't quit after the report is generated. The report will help user decide the three crucial parameters for `-l`, `-u`, and `-c`. So it's highly recommanded to run this first! Please read the report and instructions in `Choices of cutoff values` on how to decide the three crucial parameters. The resolution of cutoff analysis can be controlled by --cutoff-analysis-max and --cutoff-analysis-steps options.",
 873                               default=False)
 874     group_output.add_argument("--cutoff-analysis-max", dest="cutoff_analysis_max", type=int,
 875                               help="The maximum cutoff score for performing cutoff analysis. Together with --cutoff-analysis-steps, the resolution in the final report can be controlled. Please check the description in --cutoff-analysis-steps for detail. DEFAULT: 100",
 876                               default=100)
 877     group_output.add_argument("--cutoff-analysis-steps", dest="cutoff_analysis_steps", type=int,
 878                               help="Steps for performing cutoff analysis. It will be used to decide which cutoff value should be included in the final report. Larger the value, higher resolution the cutoff analysis can be. The cutoff analysis function will first find the smallest (at least 0) and the largest (controlled by --cutoff-analysis-max) foldchange score in the data, then break the range of foldchange score into `CUTOFF_ANALYSIS_STEPS` intervals. It will then use each foldchange score as cutoff to call peaks and calculate the total number of candidate peaks, the total basepairs of peaks, and the average length of peak in basepair. Please note that the final report ideally should include `CUTOFF_ANALYSIS_STEPS` rows, but in practice, if the foldchange cutoff yield zero peak, the row for that foldchange value won't be included.  DEFAULT: 100",
 879                               default=100)
 880     group_output.add_argument("--save-digested", dest="save_digested", action="store_true",
 881                               help="Save the digested ATAC signals of short-, mono-, di-, and tri- signals in three BedGraph files with the names NAME_short.bdg, NAME_mono.bdg, NAME_di.bdg, and NAME_tri.bdg. DEFAULT: False",
 882                               default=False)
 883     group_output.add_argument("--save-states", dest="save_states", action="store_true",
 884                               help="Save all open and nucleosomal state annotations into a BED file with the name NAME_states.bed. DEFAULT: False",
 885                               default=False)
 886     group_output.add_argument("--save-likelihoods", dest="save_likelihoods", action="store_true",
 887                               help="Save the likelihoods to each state annotation in three BedGraph files, named with NAME_open.bdg for open states, NAME_nuc.bdg for nucleosomal states, and NAME_bg.bdg for the background states. DEFAULT: False",
 888                               default=False)
 889     # group_output.add_argument("--no-peaks", dest="store_peaks", action="store_true",
 890     #                           help="Do not report peaks in bed format. Default: false",
 891     #                           default=False)
 892     # group_output.add_argument("--printExclude", dest="print_exclude", action="store_true",
 893     #                           help="Output excluded regions into Output_exclude.bed. Default: False",
 894     #                           default=False)
 895     group_output.add_argument("--save-training-data", dest="save_train", action="store_true",
 896                               help="Save the training regions and training data into NAME_training_regions.bed and NAME_training_data.txt. Default: False",
 897                               default=False)
 898
 899     # group for EM
 900     group_em = argparser_hmmratac.add_argument_group("EM algorithm arguments")
 901     group_em.add_argument("--no-fragem", dest="em_skip", action="store_true",
 902                           help="Do not perform EM training on the fragment distribution. If set, EM_MEANS and EM.STDDEVS will be used instead. Default: False",
 903                           default=False)
 904     group_em.add_argument("--means", dest="em_means", type=float, nargs=4,
 905                           help="Comma separated list of initial mean values for the fragment distribution for short fragments, mono-, di-, and tri-nucleosomal fragments. Default: 50 200 400 600",
 906                           default=[50, 200, 400, 600])
 907     group_em.add_argument("--stddevs", dest="em_stddevs", type=float, nargs=4,
 908                           help="Comma separated list of initial standard deviation values for fragment distribution for short fragments, mono-, di-, and tri-nucleosomal fragments. Default: 20 20 20 20",
 909                           default=[20, 20, 20, 20])
 910     group_em.add_argument("--min-frag-p", dest="min_frag_p", type=float,
 911                           help="We will exclude the abnormal fragments that can't be assigned to any of the four signal tracks. After we use EM to find the means and stddevs of the four distributions, we will calculate the likelihood that a given fragment length fit any of the four using normal distribution. The criteria we will use is that if a fragment length has less than MIN_FRAG_P probability to be like either of short, mono, di, or tri-nuc fragment, we will exclude it while generating the four signal tracks for later HMM training and prediction. The value should be between 0 and 1. Larger the value, more abnormal fragments will be allowed. So if you want to include more 'ideal' fragments, make this value smaller. Default=0.001",
 912                           default=0.001)
 913     # group for HMM
 914     group_hmm = argparser_hmmratac.add_argument_group("Hidden Markov Model arguments")
 915     #group_hmm.add_argument("-s", "--states", dest="hmm_states", type=int,
 916     #                        help="Number of States in the model. Default=3. If not k=3, recommend NOT calling peaks, use bedgraph. This option is named as `--kmeans` in HMMRATAC since it will also control the number of clusters in the k-means clustering process to decide the initial emissions for HMM training.",
 917     #                        default=3)
 918     group_hmm.add_argument("--binsize", dest="hmm_binsize", type=int,
 919                            help="Size of the bins to split the pileup signals for training and decoding with Hidden Markov Model. Must >= 1. Smaller the binsize, higher the resolution of the results, slower the process. Default=10",
 920                            default=10)
 921     group_hmm.add_argument("-u", "--upper", dest="hmm_upper", type=int,
 922                            help="Upper limit on fold change range for choosing training sites. This is an important parameter for training so please read. The purpose of this parameter is to EXCLUDE those unusually highly enriched chromatin regions so we can get training samples in 'ordinary' regions instead. It's highly recommended to run the `--cutoff-analysis-only` first to decide the lower cutoff `-l`, the upper cutoff `-u`, and the pre-scanning cutoff `-c`. The upper cutoff should be the cutoff in the cutoff analysis result that can capture some (typically hundreds of) extremely high enrichment and unusually wide peaks. Default: 20",
 923                            default=20)
 924     group_hmm.add_argument("-l", "--lower", dest="hmm_lower", type=int,
 925                            help="Lower limit on fold change range for choosing training sites. This is an important parameter for training so please read. The purpose of this parameter is to ONLY INCLUDE those chromatin regions having ordinary enrichment so we can get training samples to learn the common features through HMM. It's highly recommended to run the `--cutoff-analysis-only` first to decide the lower cutoff `-l`, the upper cutoff `-u`, and the pre-scanning cutoff `-c`. The lower cutoff should be the cutoff in the cutoff analysis result that can capture moderate number ( about 10k) of peaks with normal width ( average length 500-1000bps long). Default: 10",
 926                            default=10)
 927     group_hmm.add_argument("--maxTrain", dest="hmm_maxTrain", type=int,
 928                            help="Maximum number of training regions to use. After we identify the training regions between `-l` and `-u`, the lower and upper cutoffs, we will randomly pick this number of regions for training. Default: 1000",
 929                            default=1000)
 930     group_hmm.add_argument("--training-flanking", dest="hmm_training_flanking", type=int, required=False,
 931                            help="Training regions will be expanded to both side with this number of basepairs. The purpose is to include more background regions. Default: 1000",
 932                            default=1000)
 933     group_hmm.add_argument("-t", "--training", dest="hmm_training_regions", type=str, required=False,
 934                            help="Filename of training regions (previously was BED_file) to use for training HMM, instead of using foldchange settings to select. Default: NA")
 935     #group_hmm.add_argument("-z", "--zscore", dest="hmm_zscore", type=int,
 936     #                        help="Zscored read depth to mask during Viterbi decoding. Default: 100",
 937     #                        default=100)
 938     #group_hmm.add_argument("--window", dest="hmm_window", type=int,
 939     #                        help="Size of the bins to split the genome into for Viterbi decoding. To save memory, the genome is split into WINDOW long bins and viterbi decoding occurs across each bin. Default=25000000. Note: For machines with limited memory, it is recommended to reduce the size of the bins.",
 940     #                        default=25000000)
 941     group_hmm.add_argument("--model", dest="hmm_file", type=str, required=False,
 942                            help="A JSON file generated from previous HMMRATAC run to use instead of creating new one. When provided, HMM training will be skipped. Default: NA")
 943     group_hmm.add_argument("--modelonly", dest="hmm_modelonly", action="store_true", default=False,
 944                            help="Stop the program after generating model. Use this option to generate HMM model ONLY, which can be later applied with `--model`. Default: False")
 945     group_hmm.add_argument("--hmm-type", dest="hmm_type", type=str, choices=("gaussian", "poisson"), default="gaussian",
 946                            help="Use --hmm-type to select a Gaussian ('gaussian') or Poisson ('poisson') model for the hidden markov model in HMMRATAC. Default: 'gaussian'.")
 947
 948     # group for peak calling arguments
 949     group_call = argparser_hmmratac.add_argument_group("Peak calling/HMM decoding arguments")
 950     group_call.add_argument("-c", "--prescan-cutoff", dest="prescan_cutoff", type=float,
 951                             help="The fold change cutoff for prescanning candidate regions in the whole dataset. Then we will use HMM to predict/decode states on these candidate regions. Higher the prescan cutoff, fewer regions will be considered. Must > 1. This is an important parameter for decoding so please read. The purpose of this parameter is to EXCLUDE those chromatin regions having noises/random enrichment so we can have a large number of possible regions to predict the HMM states. It's highly recommended to run the `--cutoff-analysis-only` first to decide the lower cutoff `-l`, the upper cutoff `-u`, and the pre-scanning cutoff `-c`. The pre-scanning cutoff should be the cutoff close to the BOTTOM of the cutoff analysis result that can capture large number of possible peaks with normal length (average length 500-1000bps). In most cases, please do not pick a cutoff too low that capture almost all the background noises from the data. Default: 1.2",
 952                             default=1.2)
 953
 954     group_call.add_argument("--minlen", dest="openregion_minlen", type=int,
 955                             help="Minimum length of open region to call accessible regions. Must be larger than 0. If it is set as 0, it means no filtering on the length of the open regions called. Please note that, when bin size is small, setting a too small OPENREGION_MINLEN will bring a lot of false positives. Default: 100",
 956                             default=100)
 957     # group_call.add_argument("--score", dest="call_score", type=str, choices=("max", "ave", "med", "fc", "zscore", "all"),
 958     #                         help="What type of score system to use for peaks. Can be used for ranking peaks. Default: max",
 959     #                         default="max")
 960     # group_call.add_argument("--threshold", dest="call_threshold", type=float,
 961     #                         help="Threshold for reporting peaks. Only peaks who's score is >= this value will be reported. Default: 100",
 962     #                         default=100)
 963     # group for misc
 964     group_misc = argparser_hmmratac.add_argument_group("Misc arguments")
 965     group_misc.add_argument("--pileup-short", dest="pileup_short", action="store_true",
 966                             help="By default, HMMRATAC will pileup all fragments in order to identify regions for training and candidate regions for decoding. When this option is on, it will pileup only the short fragments to do so. Although it sounds a good idea since we assume that open region should have a lot of short fragments, it may be possible that the overall short fragments are too few to be useful. Default: False",
 967                             default=False)
 968     group_misc.add_argument("--randomSeed", dest="hmm_randomSeed", type=int,
 969                             help="Seed to set for random sampling of training regions. Default: 10151",
 970                             default=10151)
 971     group_misc.add_argument("--decoding-steps", dest="decoding_steps", type=int, default=1000,
 972                             help="Number of candidate regions to be decoded at a time. The HMM model will be applied with Viterbi to find the optimal state path in each region. bigger the number, 'possibly' faster the decoding process, 'definitely' larger the memory usage. Default: 1000.")
 973     group_misc.add_argument("-e", "--blacklist", dest="blacklist", type=str, required=False,
 974                             help="Filename of blacklisted regions to exclude (previously was BED_file). Examples are those from ENCODE. Default: NA")
 975     group_misc.add_argument("--keep-duplicates", dest="misc_keep_duplicates", action="store_true",
 976                             help="Keep duplicate reads from analysis. By default, duplicate reads will be removed. Default: False",
 977                             default=False)
 978     # group_misc.add_argument("--trim", dest="misc_trim", type=int,
 979     #                         help="How many signals from the end to trim off (ie starting with tri signal then di etc). This may be useful if your data doesn't contain many large fragments. Default: 0",
 980     #                         default=0)
 981     # group_misc.add_argument("-m", "--multiple-processing", dest="np", type=int,
 982     #                         help="CPU used for mutliple processing. Please note that, assigning more CPUs does not guarantee the process being faster. Creating too many parrallel processes need memory operations and  may negate benefit from multi processing. Default: 1",
 983     #                         default=1)
 984     group_misc.add_argument("--verbose", dest="verbose", type=int,
 985                             help="Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2",
 986                             default=2)
 987     # group_misc.add_argument("-q", "--minmapq", dest="min_map_quality", type=int,
 988     #                           help="Minimum mapping quality of reads to keep. Default: 30",
 989     #                           default=30)
 990     group_misc.add_argument("--buffer-size", dest="buffer_size", type=int, default="100000",
 991                             help="Buffer size for incrementally increasing internal array size to store reads alignment information. In most cases, you don't have to change this parameter. However, if there are large number of chromosomes/contigs/scaffolds in your alignment, it's recommended to specify a smaller buffer size in order to decrease memory usage (but it will take longer time to read alignment files). Minimum memory requested for reading an alignment file is about # of CHROMOSOME * BUFFER_SIZE * 8 Bytes. DEFAULT: 100000 ")
 992
 993     return
 994
 995
 996 if __name__ == '__main__':
 997     __spec__ = None
 998     try:
 999         main()
1000     except KeyboardInterrupt:
1001         sys.stderr.write("User interrupted me! ;-) Bye!\n")
1002     except MemoryError:
1003         sys.stderr.write("MemoryError occurred. If your input file has a large number of contigs/chromosomes, decrease the buffer_size value by setting --buffer-size option.")