1 # Time-stamp: <2024-10-02 15:58:36 Tao Liu>
3 """Description: MACS 3 call peak main executable
5 This code is free software; you can redistribute it and/or modify it
6 under the terms of the BSD License (see the file LICENSE included with
10 # ------------------------------------
12 # ------------------------------------
15 from time
import strftime
18 # ------------------------------------
19 # MACS3 python modules
20 # ------------------------------------
21 from MACS3
.Utilities
.Constants
import MACS_VERSION
, MAX_PAIRNUM
22 from MACS3
.Utilities
.OptValidator
import opt_validate_callpeak
23 from MACS3
.Signal
.Prob
import binomial_cdf_inv
24 from MACS3
.Signal
.PeakModel
import PeakModel
, NotEnoughPairsException
25 from MACS3
.Signal
.PeakDetect
import PeakDetect
26 from MACS3
.IO
.OutputWriter
import model2r_script
28 # ------------------------------------
30 # ------------------------------------
33 def check_names(treat
, control
, error_stream
):
34 """check common chromosome names"""
35 tchrnames
= set(treat
.get_chr_names())
36 cchrnames
= set(control
.get_chr_names())
37 commonnames
= tchrnames
.intersection(cchrnames
)
38 if len(commonnames
) == 0:
39 error_stream("No common chromosome names can be found from treatment and control!")
40 error_stream("Please make sure that the treatment and control alignment files were generated by using the same genome assembly!")
41 error_stream("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames
)))
42 error_stream("Chromosome names in control: %s" % ",".join(sorted(cchrnames
)))
47 """The Main function/pipeline for MACS.
51 options
= opt_validate_callpeak(args
)
52 # end of parsing commandline options
59 info("\n"+options
.argtxt
)
60 options
.PE_MODE
= options
.format
in ('BAMPE', 'BEDPE')
62 tag
= 'fragment' # call things fragments not tags
66 tempfile
.tempdir
= options
.tempdir
69 info("#1 read %s files...", tag
)
71 (treat
, control
) = load_frag_files_options(options
)
73 (treat
, control
) = load_tag_files_options(options
)
74 if control
is not None:
75 # check if chromosome names are consistent. quit if not.
76 check_names(treat
, control
, error
)
78 info("#1 %s size = %.1f", tag
, options
.tsize
)
79 tagsinfo
= "# %s size is determined as %d bps\n" % (tag
, options
.tsize
)
82 tagsinfo
+= "# total %ss in treatment: %d\n" % (tag
, t0
)
83 info("#1 total %ss in treatment: %d", tag
, t0
)
86 if options
.keepduplicates
!= "all":
87 if options
.keepduplicates
== "auto":
88 info("#1 calculate max duplicate %ss in single position based on binomial distribution...", tag
)
89 treatment_max_dup_tags
= cal_max_dup_tags(options
.gsize
, t0
)
90 info("#1 max_dup_tags based on binomial = %d" % (treatment_max_dup_tags
))
92 info("#1 user defined the maximum %ss...", tag
)
93 treatment_max_dup_tags
= int(options
.keepduplicates
)
95 info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags
)
97 info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags
)
99 treat
.filter_dup(treatment_max_dup_tags
)
101 info("#1 %ss after filtering in treatment: %d", tag
, t1
)
102 tagsinfo
+= "# %ss after filtering in treatment: %d\n" % (tag
, t1
)
104 tagsinfo
+= "# maximum duplicate fragments in treatment = %d\n" % (treatment_max_dup_tags
)
106 tagsinfo
+= "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags
)
107 info("#1 Redundant rate of treatment: %.2f", float(t0
- t1
) / t0
)
108 tagsinfo
+= "# Redundant rate in treatment: %.2f\n" % (float(t0
-t1
)/t0
)
112 if control
is not None:
114 tagsinfo
+= "# total %ss in control: %d\n" % (tag
, c0
)
115 info("#1 total %ss in control: %d", tag
, c0
)
117 if options
.keepduplicates
!= "all":
118 if options
.keepduplicates
== "auto":
119 info("#1 for control, calculate max duplicate %ss in single position based on binomial distribution...", tag
)
120 control_max_dup_tags
= cal_max_dup_tags(options
.gsize
, c0
)
121 info("#1 max_dup_tags based on binomial = %d" % (control_max_dup_tags
))
123 info("#1 user defined the maximum %ss...", tag
)
124 control_max_dup_tags
= int(options
.keepduplicates
)
126 info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags
)
128 info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags
)
129 control
.filter_dup(treatment_max_dup_tags
)
130 #control.separate_dups(treatment_max_dup_tags) # changed 5-29; changed back since we don't need to call addbackdup+refinepeak anymore
133 info("#1 %ss after filtering in control: %d", tag
, c1
)
134 tagsinfo
+= "# %ss after filtering in control: %d\n" % (tag
, c1
)
136 tagsinfo
+= "# maximum duplicate fragments in control = %d\n" % (treatment_max_dup_tags
)
138 tagsinfo
+= "# maximum duplicate tags at the same position in control = %d\n" % (treatment_max_dup_tags
)
140 info("#1 Redundant rate of control: %.2f" % (float(c0
-c1
)/c0
))
141 tagsinfo
+= "# Redundant rate in control: %.2f\n" % (float(c0
-c1
)/c0
)
147 info("#2 Build Peak Model...")
150 info("#2 Skipped...")
152 options
.d
= options
.tsize
154 options
.d
= options
.extsize
155 info("#2 Use %d as fragment length" % (options
.d
))
156 if options
.shift
> 0:
157 info("#2 Sequencing ends will be shifted towards 3' by %d bp(s)" % options
.shift
)
158 elif options
.shift
< 0:
159 info("#2 Sequencing ends will be shifted towards 5' by %d bp(s)" % (options
.shift
* -1))
160 options
.scanwindow
=2*options
.d
# remove the effect of --bw
162 peakmodel
= PeakModel(treatment
=treat
,
163 max_pairnum
=MAX_PAIRNUM
,
169 debug("#2 Summary Model:")
170 debug("#2 min_tags: %d" % (peakmodel
.min_tags
))
171 debug("#2 d: %d" % (peakmodel
.d
))
172 debug("#2 scan_window: %d" % (peakmodel
.scan_window
))
173 info("#2 predicted fragment length is %d bps" % peakmodel
.d
)
174 info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str, peakmodel
.alternative_d
)))
175 info("#2.2 Generate R script for model : %s" % (options
.modelR
))
176 model2r_script(peakmodel
, options
.modelR
, options
.name
)
177 options
.d
= peakmodel
.d
178 options
.scanwindow
= 2*options
.d
179 if options
.d
<= 2*options
.tsize
:
180 warn("#2 Since the d (%.0f) calculated from paired-peaks are smaller than 2*tag length, it may be influenced by unknown sequencing problem!" % (options
.d
))
182 options
.d
= options
.extsize
183 options
.scanwindow
= 2 * options
.d
184 warn("#2 MACS will use %d as EXTSIZE/fragment length d. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options
.d
))
186 warn("#2 You may need to consider one of the other alternative d(s): %s" % ','.join(map(str,peakmodel
.alternative_d
)))
187 warn("#2 You can restart the process with --nomodel --extsize XXX with your choice or an arbitrary number. Nontheless, MACS will continute computing.")
189 except NotEnoughPairsException
:
190 if not options
.onauto
:
192 warn("#2 Skipped...")
193 options
.d
= options
.extsize
194 options
.scanwindow
= 2 * options
.d
195 warn("#2 Since --fix-bimodal is set, MACS will use %d as fragment length" % (options
.d
))
198 info("#3 Call peaks...")
200 info("# local lambda is disabled!")
202 if control
and options
.PE_MODE
:
203 c1
= c1
* 2 # in PE_MODE, PE data has to be doubled since both ends will be counted for calculating background noise.
205 # decide the scaling to balance the depth between treatment and control
207 if options
.downsample
:
208 # use random sampling to balance treatment and control
209 info("#3 User prefers to use random sampling instead of linear scaling.")
211 info("#3 MACS is random sampling treatment %ss...", tag
)
213 warn("#3 Your results may not be reproducible due to the random sampling!")
215 info("#3 Random seed (%d) is used." % options
.seed
)
216 treat
.sample_num(c1
, options
.seed
)
217 info("#3 %d Tags from treatment are kept", treat
.total
)
219 info("#3 MACS is random sampling control %ss...", tag
)
221 warn("#3 Your results may not be reproducible due to the random sampling!")
223 info("#3 Random seed (%d) is used." % options
.seed
)
224 control
.sample_num(t1
, options
.seed
)
225 info("#3 %d %ss from control are kept", control
.total
, tag
)
226 # set options.tocontrol although it would;t matter now
227 options
.tocontrol
= False
229 if options
.scaleto
== "large":
231 # treatment has more tags than control, since tolarge is
232 # true, we will scale control to treatment.
233 options
.tocontrol
= False
235 # treatment has less tags than control, since tolarge is
236 # true, we will scale treatment to control.
237 options
.tocontrol
= True
240 # treatment has more tags than control, since tolarge is
241 # false, we will scale treatment to control.
242 options
.tocontrol
= True
244 # treatment has less tags than control, since tolarge is
245 # false, we will scale control to treatment.
246 options
.tocontrol
= False
248 peakdetect
= PeakDetect(treat
=treat
,
252 peakdetect
.call_peaks()
254 # filter out low FE peaks
255 peakdetect
.peaks
.filter_fc(fc_low
=options
.fecutoff
)
259 info("#4 Write output xls file... %s" % (options
.peakxls
))
260 ofhd_xls
= open(options
.peakxls
, "w")
261 ofhd_xls
.write("# This file is generated by MACS version %s\n" % (MACS_VERSION
))
262 ofhd_xls
.write(options
.argtxt
+"\n")
263 ofhd_xls
.write(tagsinfo
)
264 if options
.shift
> 0:
265 ofhd_xls
.write("# Sequencing ends will be shifted towards 3' by %d bp(s)\n" % (options
.shift
))
266 elif options
.shift
< 0:
267 ofhd_xls
.write("# Sequencing ends will be shifted towards 5' by %d bp(s)\n" % (options
.shift
* -1))
269 ofhd_xls
.write("# d = %d\n" % (options
.d
))
271 ofhd_xls
.write("# alternative fragment length(s) may be %s bps\n" % ','.join(map(str,peakmodel
.alternative_d
)))
273 # when --nomodel is used, there is no peakmodel object. Simply skip this line.
276 ofhd_xls
.write("# local lambda is disabled!\n")
277 # pass write method so we can print too, and include name
278 peakdetect
.peaks
.write_to_xls(ofhd_xls
, name
=options
.name
.encode())
282 if options
.log_pvalue
is not None:
283 score_column
= "pscore"
284 elif options
.log_qvalue
is not None:
285 score_column
= "qscore"
286 # 4.2 peaks in narrowPeak
287 if not options
.broad
:
288 info("#4 Write peak in narrowPeak format file... %s" % (options
.peakNarrowPeak
))
289 ofhd_bed
= open(options
.peakNarrowPeak
, "w")
290 peakdetect
.peaks
.write_to_narrowPeak(ofhd_bed
, name_prefix
=b
"%s_peak_", name
=options
.name
.encode(), score_column
=score_column
, trackline
=options
.trackline
)
292 # 4.2-2 summits in BED
293 info("#4 Write summits bed file... %s" % (options
.summitbed
))
294 ofhd_summits
= open(options
.summitbed
, "w")
295 peakdetect
.peaks
.write_to_summit_bed(ofhd_summits
,
296 name_prefix
="%s_peak_".encode(),
297 name
=options
.name
.encode(),
298 description
=("Summits for %s (Made with MACS v3, "
301 score_column
=score_column
,
302 trackline
=options
.trackline
)
304 # 4.2 broad peaks in bed12 or gappedPeak
306 info("#4 Write broad peak in broadPeak format file... %s" % (options
.peakBroadPeak
))
307 ofhd_bed
= open(options
.peakBroadPeak
, "w")
308 peakdetect
.peaks
.write_to_broadPeak(ofhd_bed
, name_prefix
=b
"%s_peak_", name
=options
.name
.encode(), description
=options
.name
.encode(), score_column
=score_column
, trackline
=options
.trackline
)
310 info("#4 Write broad peak in bed12/gappedPeak format file... %s" % (options
.peakGappedPeak
))
311 ofhd_bed
= open(options
.peakGappedPeak
, "w")
312 peakdetect
.peaks
.write_to_gappedPeak(ofhd_bed
, name_prefix
=b
"%s_peak_", name
=options
.name
.encode(), description
=options
.name
.encode(), score_column
=score_column
, trackline
=options
.trackline
)
318 def cal_max_dup_tags(genome_size
, tags_number
, p
=1e-5):
319 """Calculate the maximum duplicated tag number based on genome
320 size, total tag number and a p-value based on binomial
321 distribution. Brute force algorithm to calculate reverse CDF no
322 more than MAX_LAMBDA(100000).
325 return binomial_cdf_inv(1-p
, tags_number
, 1.0/genome_size
)
328 def load_frag_files_options(options
):
329 """From the options, load treatment fragments and control fragments (if available).
332 options
.info("#1 read treatment fragments...")
334 tp
= options
.parser(options
.tfile
[0], buffer_size
=options
.buffer_size
)
335 treat
= tp
.build_petrack()
336 if len(options
.tfile
) > 1:
338 for tfile
in options
.tfile
[1:]:
339 tp
= options
.parser(tfile
, buffer_size
=options
.buffer_size
)
340 treat
= tp
.append_petrack(treat
)
345 options
.info("#1.2 read input fragments...")
346 cp
= options
.parser(options
.cfile
[0], buffer_size
=options
.buffer_size
)
347 control
= cp
.build_petrack()
349 if len(options
.cfile
) > 1:
351 for cfile
in options
.cfile
[1:]:
352 cp
= options
.parser(cfile
, buffer_size
=options
.buffer_size
)
353 control
= cp
.append_petrack(control
)
357 options
.info("#1 mean fragment size is determined as %.1f bp from treatment" % options
.tsize
)
358 # options.info("#1 fragment size variance is determined as %d bp from treatment" % tp.variance)
359 if control
is not None:
360 options
.info("#1 note: mean fragment size in control is %.1f bp -- value ignored" % control_d
)
361 return (treat
, control
)
364 def load_tag_files_options(options
):
365 """From the options, load treatment tags and control tags (if available).
368 options
.info("#1 read treatment tags...")
369 tp
= options
.parser(options
.tfile
[0], buffer_size
=options
.buffer_size
)
370 if not options
.tsize
: # override tsize if user specified --tsize
372 options
.tsize
= ttsize
373 treat
= tp
.build_fwtrack()
374 if len(options
.tfile
) > 1:
376 for tfile
in options
.tfile
[1:]:
377 tp
= options
.parser(tfile
, buffer_size
=options
.buffer_size
)
378 treat
= tp
.append_fwtrack(treat
)
382 options
.info("#1.2 read input tags...")
383 control
= options
.parser(options
.cfile
[0], buffer_size
=options
.buffer_size
).build_fwtrack()
384 if len(options
.cfile
) > 1:
386 for cfile
in options
.cfile
[1:]:
387 cp
= options
.parser(cfile
, buffer_size
=options
.buffer_size
)
388 control
= cp
.append_fwtrack(control
)
392 options
.info("#1 tag size is determined as %d bps" % options
.tsize
)
393 return (treat
, control
)