# MACS3/Commands/filterdup_cmd.py
# Time-stamp: <2024-10-02 16:48:17 Tao Liu>

"""Description: Filter duplicate reads depending on sequencing depth.

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""
# ------------------------------------
# python modules
# ------------------------------------
import os
import sys

# ------------------------------------
# own python modules
# ------------------------------------
# from MACS3.Utilities.Constants import *
from MACS3.Utilities.OptValidator import opt_validate_filterdup
from MACS3.Signal.Prob import binomial_cdf_inv


# ------------------------------------
# Main function
# ------------------------------------
def run(o_options):
    """The Main function/pipeline for duplication filter.

    Reads alignment tags (single-end) or fragments (paired-end) from
    the input file(s), removes duplicates at the same location and
    strand beyond the allowed maximum, and writes the remaining
    reads/fragments to a BED file (or stdout).

    :param o_options: raw command-line options; validated by
        opt_validate_filterdup before use.
    """
    # Parse options...
    options = opt_validate_filterdup(o_options)
    # end of parsing commandline options
    info = options.info
    # warn = options.warn
    # debug = options.debug
    # error = options.error

    # BAMPE/BEDPE formats carry both mates, so read in paired-end mode.
    options.PE_MODE = options.format in ('BAMPE', 'BEDPE')

    if options.outputfile != "stdout":
        outfhd = open(os.path.join(options.outdir, options.outputfile), "w")
    else:
        outfhd = sys.stdout

    try:
        # 1 Read tag files
        if options.PE_MODE:
            info("# read input file in Paired-end mode.")
            inputtrack = load_frag_files_options(options)  # return PETrackI object
            t0 = inputtrack.total  # total fragments
            info("# total fragments/pairs in alignment file: %d" % (t0))
        else:
            info("# read tag files...")
            inputtrack = load_tag_files_options(options)
            info("# tag size = %d" % options.tsize)
            inputtrack.fw = options.tsize
            t0 = inputtrack.total
            info("# total tags in alignment file: %d" % (t0))

        # 2 Filter duplicates unless the user asked to keep them all.
        if options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                # Threshold derived from a binomial model of tags per site.
                info("calculate max duplicate tags in single position based on binomal distribution...")
                max_dup_tags = cal_max_dup_tags(options.gsize, t0)
                info(" max_dup_tags based on binomal = %d" % (max_dup_tags))
                info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags))
            else:
                info("user defined the maximum tags...")
                max_dup_tags = int(options.keepduplicates)
                info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags))

            inputtrack.filter_dup(max_dup_tags)
            t1 = inputtrack.total
            info(" tags after filtering in alignment file: %d" % (t1))
            # Guard against empty input so the rate line cannot divide by zero.
            if t0 > 0:
                info(" Redundant rate of alignment file: %.2f" % (float(t0-t1)/t0))

        # 3 Write output (skipped in dry-run mode).
        if not options.dryrun:
            info("Write to BED file")
            inputtrack.print_to_bed(fhd=outfhd)
            info("finished! Check %s." % options.outputfile)
        else:
            info("Dry-run is finished!")
    finally:
        # Close the handle we opened; never close sys.stdout.
        if outfhd is not sys.stdout:
            outfhd.close()
def cal_max_dup_tags(genome_size, tags_number, p=1e-5):
    """Return the maximum allowed number of duplicated tags at a single
    genomic position.

    The cutoff is the inverse CDF of Binomial(tags_number,
    1/genome_size) evaluated at probability 1 - p, computed by a brute
    force search capped at MAX_LAMBDA (100000).

    :param genome_size: effective (mappable) genome size.
    :param tags_number: total number of tags in the alignment file.
    :param p: tail probability used to set the cutoff (default 1e-5).
    """
    per_site_prob = 1.0 / genome_size
    quantile = 1 - p
    return binomial_cdf_inv(quantile, tags_number, per_site_prob)
def load_tag_files_options(options):
    """From the options, load alignment tags.

    Builds a single-end tag track (FWTrack) from the first input file;
    when more than one input file is given, all subsequent files are
    appended into the same track before it is finalized.
    """
    options.info("# read treatment tags...")
    tp = options.parser(options.ifile[0], buffer_size=options.buffer_size)
    if not options.tsize:
        # tsize was NOT specified on the command line (--tsize absent),
        # so detect it from the data instead.
        ttsize = tp.tsize()
        options.tsize = ttsize
    treat = tp.build_fwtrack()
    if len(options.ifile) > 1:
        # multiple input files: merge them into one track
        for tfile in options.ifile[1:]:
            tp = options.parser(tfile, buffer_size=options.buffer_size)
            treat = tp.append_fwtrack(treat)
    # treat.sort()
    treat.finalize()
    return treat
def load_frag_files_options(options):
    """From the options, load treatment fragments as a paired-end track.

    NOTE: despite wording elsewhere in MACS, filterdup takes no control
    input — only treatment files are read here. Builds a PE track from
    the first input file and appends any additional files into it.
    """
    options.info("# read treatment fragments...")
    tp = options.parser(options.ifile[0], buffer_size=options.buffer_size)
    treat = tp.build_petrack()
    if len(options.ifile) > 1:
        # multiple input files: append them into the same PE track
        for tfile in options.ifile[1:]:
            tp = options.parser(tfile, buffer_size=options.buffer_size)
            treat = tp.append_petrack(treat)
    treat.finalize()
    return treat