1 # Time-stamp: <2024-10-02 16:48:17 Tao Liu>
3 """Description: Filter duplicate reads depending on sequencing depth.
5 This code is free software; you can redistribute it and/or modify it
6 under the terms of the BSD License (see the file LICENSE included with
10 # ------------------------------------
12 # ------------------------------------
17 # ------------------------------------
19 # ------------------------------------
20 # from MACS3.Utilities.Constants import *
21 from MACS3
.Utilities
.OptValidator
import opt_validate_filterdup
22 from MACS3
.Signal
.Prob
import binomial_cdf_inv
23 # ------------------------------------
25 # ------------------------------------
29 """The Main function/pipeline for duplication filter.
33 options
= opt_validate_filterdup(o_options
)
34 # end of parsing commandline options
37 # debug = options.debug
38 # error = options.error
40 options
.PE_MODE
= options
.format
in ('BAMPE', 'BEDPE')
42 if options
.outputfile
!= "stdout":
43 outfhd
= open(os
.path
.join(options
.outdir
, options
.outputfile
), "w")
49 info("# read input file in Paired-end mode.")
50 inputtrack
= load_frag_files_options(options
) # return PETrackI object
51 t0
= inputtrack
.total
# total fragments
52 info("# total fragments/pairs in alignment file: %d" % (t0
))
54 info("# read tag files...")
55 inputtrack
= load_tag_files_options(options
)
57 info("# tag size = %d" % options
.tsize
)
58 inputtrack
.fw
= options
.tsize
61 info("# total tags in alignment file: %d" % (t0
))
63 if options
.keepduplicates
!= "all":
64 if options
.keepduplicates
== "auto":
65 info("calculate max duplicate tags in single position based on binomal distribution...")
66 max_dup_tags
= cal_max_dup_tags(options
.gsize
,t0
)
67 info(" max_dup_tags based on binomal = %d" % (max_dup_tags
))
68 info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags
))
70 info("user defined the maximum tags...")
71 max_dup_tags
= int(options
.keepduplicates
)
72 info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags
))
74 inputtrack
.filter_dup(max_dup_tags
)
77 info(" tags after filtering in alignment file: %d" % (t1
))
78 info(" Redundant rate of alignment file: %.2f" % (float(t0
-t1
)/t0
))
80 if not options
.dryrun
:
81 info("Write to BED file")
82 inputtrack
.print_to_bed(fhd
=outfhd
)
83 info("finished! Check %s." % options
.outputfile
)
85 info("Dry-run is finished!")
87 def cal_max_dup_tags(genome_size
, tags_number
, p
=1e-5):
88 """Calculate the maximum duplicated tag number based on genome
89 size, total tag number and a p-value based on binomial
90 distribution. Brute force algorithm to calculate reverse CDF no
91 more than MAX_LAMBDA(100000).
94 return binomial_cdf_inv(1-p
, tags_number
, 1.0/genome_size
)
97 def load_tag_files_options(options
):
98 """From the options, load alignment tags.
101 options
.info("# read treatment tags...")
102 tp
= options
.parser(options
.ifile
[0], buffer_size
=options
.buffer_size
)
103 if not options
.tsize
: # override tsize if user specified --tsize
105 options
.tsize
= ttsize
107 treat
= tp
.build_fwtrack()
108 if len(options
.ifile
) > 1:
110 for tfile
in options
.ifile
[1:]:
111 tp
= options
.parser(tfile
, buffer_size
=options
.buffer_size
)
112 treat
= tp
.append_fwtrack(treat
)
118 def load_frag_files_options(options
):
119 """From the options, load treatment fragments and control fragments (if available).
122 options
.info("# read treatment fragments...")
124 tp
= options
.parser(options
.ifile
[0], buffer_size
=options
.buffer_size
)
125 treat
= tp
.build_petrack()
126 if len(options
.ifile
) > 1:
128 for tfile
in options
.ifile
[1:]:
129 tp
= options
.parser(tfile
, buffer_size
=options
.buffer_size
)
130 treat
= tp
.append_petrack(treat
)