1 # Time-stamp: <2019-11-04 10:41:13 taoliu>
"""Description: MACS 2 main executable

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""
10 # ------------------------------------
12 # ------------------------------------
17 from time
import strftime
20 # ------------------------------------
22 # ------------------------------------
23 from MACS2
.OptValidator
import opt_validate
24 from MACS2
.Prob
import binomial_cdf_inv
25 from MACS2
.PeakModel
import PeakModel
,NotEnoughPairsException
26 from MACS2
.PeakDetect
import PeakDetect
27 from MACS2
.OutputWriter
import model2r_script
28 from MACS2
.Constants
import *
29 # ------------------------------------
31 # ------------------------------------
def check_names(treat, control, error_stream):
    """Check that treatment and control share at least one chromosome name.

    When the two name sets are disjoint, report the problem (and both
    name lists) through *error_stream*; otherwise do nothing.
    """
    treat_names = set(treat.get_chr_names())
    control_names = set(control.get_chr_names())
    shared_names = treat_names & control_names
    if not shared_names:
        error_stream("No common chromosome names can be found from treatment and control! Check your input files! MACS will quit...")
        error_stream("Chromosome names in treatment: %s" % ",".join(sorted(treat_names)))
        error_stream("Chromosome names in control: %s" % ",".join(sorted(control_names)))
44 """The Main function/pipeline for MACS.
48 options
= opt_validate( args
)
49 # end of parsing commandline options
55 info("\n"+options
.argtxt
)
56 options
.PE_MODE
= options
.format
in ('BAMPE','BEDPE')
57 if options
.PE_MODE
: tag
= 'fragment' # call things fragments not tags
60 tempfile
.tempdir
= options
.tempdir
63 info("#1 read %s files...", tag
)
64 if options
.PE_MODE
: (treat
, control
) = load_frag_files_options (options
)
65 else: (treat
, control
) = load_tag_files_options (options
)
66 if control
is not None: check_names(treat
, control
, error
)
68 info("#1 %s size = %.1f", tag
, options
.tsize
)
69 tagsinfo
= "# %s size is determined as %d bps\n" % (tag
, options
.tsize
)
72 tagsinfo
+= "# total %ss in treatment: %d\n" % (tag
, t0
)
73 info("#1 total %ss in treatment: %d", tag
, t0
)
75 # options.filteringmodel = True
76 # if options.filteringmodel:
77 # treat.separate_dups()
78 # t0 = treat.total + treat.dups.total
80 # info("#1 Redundant rate of treatment: %.2f", float(t0 - t1) / t0)
81 # tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0)
82 # elif options.keepduplicates != "all":
83 if options
.keepduplicates
!= "all":
84 if options
.keepduplicates
== "auto":
85 info("#1 calculate max duplicate %ss in single position based on binomial distribution...", tag
)
86 treatment_max_dup_tags
= cal_max_dup_tags(options
.gsize
,t0
)
87 info("#1 max_dup_tags based on binomial = %d" % (treatment_max_dup_tags
))
89 info("#1 user defined the maximum %ss...", tag
)
90 treatment_max_dup_tags
= int(options
.keepduplicates
)
92 info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags
)
94 info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags
)
95 #treat.separate_dups(treatment_max_dup_tags) # changed 5-29; changed back since we don't need to addbackdup+refinepeak anymore
96 treat
.filter_dup(treatment_max_dup_tags
)
98 info("#1 %ss after filtering in treatment: %d", tag
, t1
)
99 tagsinfo
+= "# %ss after filtering in treatment: %d\n" % (tag
, t1
)
101 tagsinfo
+= "# maximum duplicate fragments in treatment = %d\n" % (treatment_max_dup_tags
)
103 tagsinfo
+= "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags
)
104 info("#1 Redundant rate of treatment: %.2f", float(t0
- t1
) / t0
)
105 tagsinfo
+= "# Redundant rate in treatment: %.2f\n" % (float(t0
-t1
)/t0
)
109 if control
is not None:
111 tagsinfo
+= "# total %ss in control: %d\n" % (tag
, c0
)
112 info("#1 total %ss in control: %d", tag
, c0
)
114 #if options.filteringmodel:
115 # control.separate_dups()
116 # c0 = treat.total + treat.dups.total
118 # info("#1 Redundant rate of treatment: %.2f", float(c0 - c1) / c0)
119 # tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(c0-c1)/c0)
120 #elif options.keepduplicates != "all":
121 if options
.keepduplicates
!= "all":
122 if options
.keepduplicates
== "auto":
123 info("#1 for control, calculate max duplicate %ss in single position based on binomial distribution...", tag
)
124 control_max_dup_tags
= cal_max_dup_tags(options
.gsize
,c0
)
125 info("#1 max_dup_tags based on binomial = %d" % (control_max_dup_tags
))
127 info("#1 user defined the maximum %ss...", tag
)
128 control_max_dup_tags
= int(options
.keepduplicates
)
130 info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags
)
132 info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags
)
133 control
.filter_dup(treatment_max_dup_tags
)
134 #control.separate_dups(treatment_max_dup_tags) # changed 5-29; changed back since we don't need to call addbackdup+refinepeak anymore
137 info("#1 %ss after filtering in control: %d", tag
, c1
)
138 tagsinfo
+= "# %ss after filtering in control: %d\n" % (tag
, c1
)
140 tagsinfo
+= "# maximum duplicate fragments in control = %d\n" % (treatment_max_dup_tags
)
142 tagsinfo
+= "# maximum duplicate tags at the same position in control = %d\n" % (treatment_max_dup_tags
)
144 info("#1 Redundant rate of control: %.2f" % (float(c0
-c1
)/c0
))
145 tagsinfo
+= "# Redundant rate in control: %.2f\n" % (float(c0
-c1
)/c0
)
151 info("#2 Build Peak Model...")
154 info("#2 Skipped...")
156 #options.shiftsize = 0
157 options
.d
= options
.tsize
159 options
.d
=options
.extsize
160 info("#2 Use %d as fragment length" % (options
.d
))
161 if options
.shift
> 0:
162 info("#2 Sequencing ends will be shifted towards 3' by %d bp(s)" % (options
.shift
))
163 elif options
.shift
< 0:
164 info("#2 Sequencing ends will be shifted towards 5' by %d bp(s)" % (options
.shift
* -1))
165 options
.scanwindow
=2*options
.d
# remove the effect of --bw
168 peakmodel
= PeakModel(treatment
= treat
,
169 max_pairnum
= MAX_PAIRNUM
,
173 debug("#2 Summary Model:")
174 debug("#2 min_tags: %d" % (peakmodel
.min_tags
))
175 debug("#2 d: %d" % (peakmodel
.d
))
176 debug("#2 scan_window: %d" % (peakmodel
.scan_window
))
177 info("#2 predicted fragment length is %d bps" % peakmodel
.d
)
178 info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel
.alternative_d
)))
179 info("#2.2 Generate R script for model : %s" % (options
.modelR
))
180 model2r_script(peakmodel
,options
.modelR
,options
.name
)
181 options
.d
= peakmodel
.d
182 options
.scanwindow
= 2*options
.d
183 if options
.d
<= 2*options
.tsize
:
184 warn("#2 Since the d (%.0f) calculated from paired-peaks are smaller than 2*tag length, it may be influenced by unknown sequencing problem!" % (options
.d
))
186 options
.d
=options
.extsize
187 options
.scanwindow
=2*options
.d
188 warn("#2 MACS will use %d as EXTSIZE/fragment length d. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options
.d
))
190 warn("#2 You may need to consider one of the other alternative d(s): %s" % ','.join(map(str,peakmodel
.alternative_d
)))
191 warn("#2 You can restart the process with --nomodel --extsize XXX with your choice or an arbitrary number. Nontheless, MACS will continute computing.")
193 except NotEnoughPairsException
:
194 if not options
.onauto
:
196 warn("#2 Skipped...")
197 options
.d
=options
.extsize
198 options
.scanwindow
=2*options
.d
199 warn("#2 Since --fix-bimodal is set, MACS will use %d as fragment length" % (options
.d
))
202 info("#3 Call peaks...")
204 info("# local lambda is disabled!")
206 # decide options.tocontrol according to options.tolarge
207 if control
and options
.PE_MODE
:
211 if options
.downsample
:
212 # use random sampling to balance treatment and control
213 info("#3 User prefers to use random sampling instead of linear scaling.")
215 info("#3 MACS is random sampling treatment %ss...", tag
)
217 warn("#3 Your results may not be reproducible due to the random sampling!")
219 info("#3 Random seed (%d) is used." % options
.seed
)
220 treat
.sample_num(c1
, options
.seed
)
221 info("#3 %d Tags from treatment are kept", treat
.total
)
223 info("#3 MACS is random sampling control %ss...", tag
)
225 warn("#3 Your results may not be reproducible due to the random sampling!")
227 info("#3 Random seed (%d) is used." % options
.seed
)
228 control
.sample_num(t1
, options
.seed
)
229 info("#3 %d %ss from control are kept", control
.total
, tag
)
230 # set options.tocontrol although it would;t matter now
231 options
.tocontrol
= False
233 if options
.scaleto
== "large":
235 # treatment has more tags than control, since tolarge is
236 # true, we will scale control to treatment.
237 options
.tocontrol
= False
239 # treatment has less tags than control, since tolarge is
240 # true, we will scale treatment to control.
241 options
.tocontrol
= True
244 # treatment has more tags than control, since tolarge is
245 # false, we will scale treatment to control.
246 options
.tocontrol
= True
248 # treatment has less tags than control, since tolarge is
249 # false, we will scale control to treatment.
250 options
.tocontrol
= False
252 peakdetect
= PeakDetect(treat
= treat
,
256 peakdetect
.call_peaks()
258 # filter out low FE peaks
259 peakdetect
.peaks
.filter_fc( fc_low
= options
.fecutoff
)
261 #call refinepeak if needed.
262 # if options.refine_peaks:
263 # info("#3 now put back duplicate reads...")
264 # treat.addback_dups()
265 # info("#3 calculate reads balance to refine peak summits...")
266 # refined_peaks = treat.refine_peak_from_tags_distribution ( peakdetect.peaks, options.d, 0 )
267 # info("#3 reassign scores for newly refined peak summits...")
268 # peakdetect.peaks = peakdetect.scoretrack.reassign_peaks( refined_peaks ) # replace
269 # #info("#3 write to file: %s ..." % options.name+"_refined_peaks.encodePeak" )
270 # #refinedpeakfile = open(options.name+"_refined_peaks.encodePeak", "w")
271 # #refined_peaks.write_to_narrowPeak (refinedpeakfile, name_prefix="%s_refined_peak_", name=options.name, score_column=score_column, trackline=options.trackline )
273 #diag_result = peakdetect.diag_result()
276 info("#4 Write output xls file... %s" % (options
.peakxls
))
277 ofhd_xls
= open( options
.peakxls
, "w" )
278 ofhd_xls
.write("# This file is generated by MACS version %s\n" % (MACS_VERSION
))
279 ofhd_xls
.write(options
.argtxt
+"\n")
281 ofhd_xls
.write(tagsinfo
)
283 if options
.shift
> 0:
284 ofhd_xls
.write("# Sequencing ends will be shifted towards 3' by %d bp(s)\n" % (options
.shift
))
285 elif options
.shift
< 0:
286 ofhd_xls
.write("# Sequencing ends will be shifted towards 5' by %d bp(s)\n" % (options
.shift
* -1))
288 ofhd_xls
.write("# d = %d\n" % (options
.d
))
290 ofhd_xls
.write("# alternative fragment length(s) may be %s bps\n" % ','.join(map(str,peakmodel
.alternative_d
)))
292 # when --nomodel is used, there is no peakmodel object. Simply skip this line.
295 ofhd_xls
.write("# local lambda is disabled!\n")
296 # pass write method so we can print too, and include name
297 peakdetect
.peaks
.write_to_xls(ofhd_xls
, name
= options
.name
.encode())
300 if options
.log_pvalue
!= None:
301 score_column
= "pscore"
302 elif options
.log_qvalue
!= None:
303 score_column
= "qscore"
304 #4.2 peaks in narrowPeak
305 if not options
.broad
:
306 #info("#4 Write peak bed file... %s" % (options.peakbed))
307 #ofhd_bed = open(options.peakbed,"w")
308 #peakdetect.peaks.write_to_bed (ofhd_bed, name_prefix="%s_peak_", name = options.name, description="Peaks for %s (Made with MACS v2, " + strftime("%x") + ")", score_column=score_column, trackline=options.trackline)
310 info("#4 Write peak in narrowPeak format file... %s" % (options
.peakNarrowPeak
))
311 ofhd_bed
= open( options
.peakNarrowPeak
, "w" )
312 peakdetect
.peaks
.write_to_narrowPeak (ofhd_bed
, name_prefix
=b
"%s_peak_", name
=options
.name
.encode(), score_column
=score_column
, trackline
=options
.trackline
)
314 #4.2-2 summits in BED
315 info("#4 Write summits bed file... %s" % (options
.summitbed
))
316 ofhd_summits
= open( options
.summitbed
, "w" )
317 peakdetect
.peaks
.write_to_summit_bed (ofhd_summits
, name_prefix
="%s_peak_".encode(), name
=options
.name
.encode(),
318 description
=("Summits for %s (Made with MACS v2, " + strftime("%x") + ")").encode(),
319 score_column
=score_column
, trackline
=options
.trackline
)
321 #4.2 broad peaks in bed12 or gappedPeak
323 info("#4 Write broad peak in broadPeak format file... %s" % (options
.peakBroadPeak
))
324 ofhd_bed
= open( options
.peakBroadPeak
, "w" )
325 peakdetect
.peaks
.write_to_broadPeak (ofhd_bed
, name_prefix
=b
"%s_peak_", name
=options
.name
.encode(), description
=options
.name
.encode(), score_column
=score_column
, trackline
=options
.trackline
)
327 info("#4 Write broad peak in bed12/gappedPeak format file... %s" % (options
.peakGappedPeak
))
328 ofhd_bed
= open( options
.peakGappedPeak
, "w" )
329 peakdetect
.peaks
.write_to_gappedPeak (ofhd_bed
, name_prefix
=b
"%s_peak_", name
=options
.name
.encode(), description
=options
.name
.encode(), score_column
=score_column
, trackline
=options
.trackline
)
def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ):
    """Calculate the maximum duplicated tag number based on genome
    size, total tag number and a p-value based on binomial
    distribution. Brute force algorithm to calculate reverse CDF no
    more than MAX_LAMBDA(100000).
    """
    # Smallest k with P(X <= k) >= 1-p where X ~ Binomial(tags_number, 1/genome_size),
    # i.e. the duplicate count unlikely to occur by chance at a single position.
    per_site_rate = 1.0 / genome_size
    return binomial_cdf_inv(1 - p, tags_number, per_site_rate)
343 def load_frag_files_options ( options
):
344 """From the options, load treatment fragments and control fragments (if available).
347 options
.info("#1 read treatment fragments...")
349 tp
= options
.parser(options
.tfile
[0], buffer_size
=options
.buffer_size
)
350 treat
= tp
.build_petrack()
352 if len(options
.tfile
) > 1:
354 for tfile
in options
.tfile
[1:]:
355 tp
= options
.parser(tfile
, buffer_size
=options
.buffer_size
)
356 treat
= tp
.append_petrack( treat
)
362 options
.info("#1.2 read input fragments...")
363 cp
= options
.parser(options
.cfile
[0], buffer_size
=options
.buffer_size
)
364 control
= cp
.build_petrack()
367 if len(options
.cfile
) > 1:
369 for cfile
in options
.cfile
[1:]:
370 cp
= options
.parser(cfile
, buffer_size
=options
.buffer_size
)
371 control
= cp
.append_petrack( control
)
376 options
.info("#1 mean fragment size is determined as %.1f bp from treatment" % options
.tsize
)
377 # options.info("#1 fragment size variance is determined as %d bp from treatment" % tp.variance)
378 if control
is not None:
379 options
.info("#1 note: mean fragment size in control is %.1f bp -- value ignored" % control_d
)
380 return (treat
, control
)
382 def load_tag_files_options ( options
):
383 """From the options, load treatment tags and control tags (if available).
386 options
.info("#1 read treatment tags...")
387 tp
= options
.parser(options
.tfile
[0], buffer_size
=options
.buffer_size
)
388 if not options
.tsize
: # override tsize if user specified --tsize
390 options
.tsize
= ttsize
391 treat
= tp
.build_fwtrack()
393 if len(options
.tfile
) > 1:
395 for tfile
in options
.tfile
[1:]:
396 tp
= options
.parser(tfile
, buffer_size
=options
.buffer_size
)
397 treat
= tp
.append_fwtrack( treat
)
402 options
.info("#1.2 read input tags...")
403 control
= options
.parser(options
.cfile
[0], buffer_size
=options
.buffer_size
).build_fwtrack()
405 if len(options
.cfile
) > 1:
407 for cfile
in options
.cfile
[1:]:
408 cp
= options
.parser(cfile
, buffer_size
=options
.buffer_size
)
409 control
= cp
.append_fwtrack( control
)
414 options
.info("#1 tag size is determined as %d bps" % options
.tsize
)
415 return (treat
, control
)