MACS3/Commands/filterdup_cmd.py

   1 # Time-stamp: <2024-10-02 16:48:17 Tao Liu>
   2
   3 """Description: Filter duplicate reads depending on sequencing depth.
   4
   5 This code is free software; you can redistribute it and/or modify it
   6 under the terms of the BSD License (see the file LICENSE included with
   7 the distribution).
   8 """
   9
  10 # ------------------------------------
  11 # python modules
  12 # ------------------------------------
  13
  14 import os
  15 import sys
  16
  17 # ------------------------------------
  18 # own python modules
  19 # ------------------------------------
  20 # from MACS3.Utilities.Constants import *
  21 from MACS3.Utilities.OptValidator import opt_validate_filterdup
  22 from MACS3.Signal.Prob import binomial_cdf_inv
  23 # ------------------------------------
  24 # Main function
  25 # ------------------------------------
  26
  27
  28 def run(o_options):
  29     """The Main function/pipeline for duplication filter.
  30
  31     """
  32     # Parse options...
  33     options = opt_validate_filterdup(o_options)
  34     # end of parsing commandline options
  35     info = options.info
  36     # warn = options.warn
  37     # debug = options.debug
  38     # error = options.error
  39
  40     options.PE_MODE = options.format in ('BAMPE', 'BEDPE')
  41
  42     if options.outputfile != "stdout":
  43         outfhd = open(os.path.join(options.outdir, options.outputfile), "w")
  44     else:
  45         outfhd = sys.stdout
  46
  47     # 1 Read tag files
  48     if options.PE_MODE:
  49         info("# read input file in Paired-end mode.")
  50         inputtrack = load_frag_files_options(options)  # return PETrackI object
  51         t0 = inputtrack.total   # total fragments
  52         info("# total fragments/pairs in alignment file: %d" % (t0))
  53     else:
  54         info("# read tag files...")
  55         inputtrack = load_tag_files_options(options)
  56
  57         info("# tag size = %d" % options.tsize)
  58         inputtrack.fw = options.tsize
  59
  60         t0 = inputtrack.total
  61         info("# total tags in alignment file: %d" % (t0))
  62
  63     if options.keepduplicates != "all":
  64         if options.keepduplicates == "auto":
  65             info("calculate max duplicate tags in single position based on binomal distribution...")
  66             max_dup_tags = cal_max_dup_tags(options.gsize,t0)
  67             info(" max_dup_tags based on binomal = %d" % (max_dup_tags))
  68             info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags))
  69         else:
  70             info("user defined the maximum tags...")
  71             max_dup_tags = int(options.keepduplicates)
  72             info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags))
  73
  74         inputtrack.filter_dup(max_dup_tags)
  75         t1 = inputtrack.total
  76
  77         info(" tags after filtering in alignment file: %d" % (t1))
  78         info(" Redundant rate of alignment file: %.2f" % (float(t0-t1)/t0))
  79
  80     if not options.dryrun:
  81         info("Write to BED file")
  82         inputtrack.print_to_bed(fhd=outfhd)
  83         info("finished! Check %s." % options.outputfile)
  84     else:
  85         info("Dry-run is finished!")
  86
  87 def cal_max_dup_tags(genome_size, tags_number, p=1e-5):
  88     """Calculate the maximum duplicated tag number based on genome
  89     size, total tag number and a p-value based on binomial
  90     distribution. Brute force algorithm to calculate reverse CDF no
  91     more than MAX_LAMBDA(100000).
  92
  93     """
  94     return binomial_cdf_inv(1-p, tags_number, 1.0/genome_size)
  95
  96
  97 def load_tag_files_options(options):
  98     """From the options, load alignment tags.
  99
 100     """
 101     options.info("# read treatment tags...")
 102     tp = options.parser(options.ifile[0], buffer_size=options.buffer_size)
 103     if not options.tsize:           # override tsize if user specified --tsize
 104         ttsize = tp.tsize()
 105         options.tsize = ttsize
 106
 107     treat = tp.build_fwtrack()
 108     if len(options.ifile) > 1:
 109         # multiple input
 110         for tfile in options.ifile[1:]:
 111             tp = options.parser(tfile, buffer_size=options.buffer_size)
 112             treat = tp.append_fwtrack(treat)
 113             # treat.sort()
 114     treat.finalize()
 115     return treat
 116
 117
 118 def load_frag_files_options(options):
 119     """From the options, load treatment fragments and control fragments (if available).
 120
 121     """
 122     options.info("# read treatment fragments...")
 123
 124     tp = options.parser(options.ifile[0], buffer_size=options.buffer_size)
 125     treat = tp.build_petrack()
 126     if len(options.ifile) > 1:
 127         # multiple input
 128         for tfile in options.ifile[1:]:
 129             tp = options.parser(tfile, buffer_size=options.buffer_size)
 130             treat = tp.append_petrack(treat)
 131     treat.finalize()
 132     return treat