# MACS3/Commands/callpeak_cmd.py
# Snapshot from the commit "implement PETrackII for fragment files from scATAC experiments".
# Time-stamp: <2024-10-02 15:58:36 Tao Liu>

"""Description: MACS 3 call peak main executable

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""

# ------------------------------------
# python modules
# ------------------------------------

import sys
from time import strftime
import tempfile

# ------------------------------------
# MACS3 python modules
# ------------------------------------

from MACS3.Utilities.Constants import MACS_VERSION, MAX_PAIRNUM
from MACS3.Utilities.OptValidator import opt_validate_callpeak
from MACS3.Signal.Prob import binomial_cdf_inv
from MACS3.Signal.PeakModel import PeakModel, NotEnoughPairsException
from MACS3.Signal.PeakDetect import PeakDetect
from MACS3.IO.OutputWriter import model2r_script

# ------------------------------------
# Main function
# ------------------------------------


def check_names(treat, control, error_stream):
    """check common chromosome names"""
    tchrnames = set(treat.get_chr_names())
    cchrnames = set(control.get_chr_names())
    commonnames = tchrnames.intersection(cchrnames)
    if len(commonnames) == 0:
        error_stream("No common chromosome names can be found from treatment and control!")
        error_stream("Please make sure that the treatment and control alignment files were generated by using the same genome assembly!")
        error_stream("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames)))
        error_stream("Chromosome names in control: %s" % ",".join(sorted(cchrnames)))
        sys.exit()
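

# run() drives the callpeak pipeline in four numbered stages that mirror the log
# messages below: #1 read alignment files and filter duplicates, #2 build (or
# skip) the fragment-size model, #3 call and filter peaks, and #4 write the
# XLS/narrowPeak/broadPeak/summit output files.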
def run(args):
    """The Main function/pipeline for MACS.

    """
    # Parse options...
    options = opt_validate_callpeak(args)
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    # 0 output arguments
    info("\n" + options.argtxt)
    options.PE_MODE = options.format in ('BAMPE', 'BEDPE')
    if options.PE_MODE:
        tag = 'fragment'        # call things fragments not tags
    else:
        tag = 'tag'
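
    # In PE mode (BAMPE/BEDPE input), whole sequenced fragments are loaded as
    # paired-end tracks and counted as "fragments" rather than single-end "tags";
    # see load_frag_files_options() below.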
    tempfile.tempdir = options.tempdir
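    # Note: pointing tempfile.tempdir at --tempdir makes any temporary files that
    # Python's tempfile module creates later in the run land in the user-chosen
    # directory (useful when the default temporary location is small).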

    # 1 Read tag files
    info("#1 read %s files...", tag)
    if options.PE_MODE:
        (treat, control) = load_frag_files_options(options)
    else:
        (treat, control) = load_tag_files_options(options)
    if control is not None:
        # check if chromosome names are consistent. quit if not.
        check_names(treat, control, error)

    info("#1 %s size = %.1f", tag, options.tsize)
    tagsinfo = "# %s size is determined as %d bps\n" % (tag, options.tsize)

    t0 = treat.total
    tagsinfo += "# total %ss in treatment: %d\n" % (tag, t0)
    info("#1 total %ss in treatment: %d", tag, t0)

    # handle duplicates
    if options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            info("#1 calculate max duplicate %ss in single position based on binomial distribution...", tag)
            treatment_max_dup_tags = cal_max_dup_tags(options.gsize, t0)
            info("#1 max_dup_tags based on binomial = %d" % (treatment_max_dup_tags))
        else:
            info("#1 user defined the maximum %ss...", tag)
            treatment_max_dup_tags = int(options.keepduplicates)
        if options.PE_MODE:
            info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
        else:
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)

        treat.filter_dup(treatment_max_dup_tags)
        t1 = treat.total
        info("#1 %ss after filtering in treatment: %d", tag, t1)
        tagsinfo += "# %ss after filtering in treatment: %d\n" % (tag, t1)
        if options.PE_MODE:
            tagsinfo += "# maximum duplicate fragments in treatment = %d\n" % (treatment_max_dup_tags)
        else:
            tagsinfo += "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags)
        info("#1 Redundant rate of treatment: %.2f", float(t0 - t1) / t0)
        tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0 - t1) / t0)
    else:
        t1 = t0

    if control is not None:
        c0 = control.total
        tagsinfo += "# total %ss in control: %d\n" % (tag, c0)
        info("#1 total %ss in control: %d", tag, c0)

        if options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("#1 for control, calculate max duplicate %ss in single position based on binomial distribution...", tag)
                control_max_dup_tags = cal_max_dup_tags(options.gsize, c0)
                info("#1 max_dup_tags based on binomial = %d" % (control_max_dup_tags))
            else:
                info("#1 user defined the maximum %ss...", tag)
                control_max_dup_tags = int(options.keepduplicates)
            if options.PE_MODE:
                info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
            else:
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
            control.filter_dup(treatment_max_dup_tags)
            # control.separate_dups(treatment_max_dup_tags)  # changed 5-29; changed back since we don't need to call addbackdup+refinepeak anymore
            c1 = control.total

            info("#1 %ss after filtering in control: %d", tag, c1)
            tagsinfo += "# %ss after filtering in control: %d\n" % (tag, c1)
            if options.PE_MODE:
                tagsinfo += "# maximum duplicate fragments in control = %d\n" % (treatment_max_dup_tags)
            else:
                tagsinfo += "# maximum duplicate tags at the same position in control = %d\n" % (treatment_max_dup_tags)

            info("#1 Redundant rate of control: %.2f" % (float(c0 - c1) / c0))
            tagsinfo += "# Redundant rate in control: %.2f\n" % (float(c0 - c1) / c0)
        else:
            c1 = c0
    info("#1 finished!")

    # 2 Build Model
    info("#2 Build Peak Model...")
    if options.nomodel:
        info("#2 Skipped...")
        if options.PE_MODE:
            options.d = options.tsize
        else:
            options.d = options.extsize
        info("#2 Use %d as fragment length" % (options.d))
        if options.shift > 0:
            info("#2 Sequencing ends will be shifted towards 3' by %d bp(s)" % options.shift)
        elif options.shift < 0:
            info("#2 Sequencing ends will be shifted towards 5' by %d bp(s)" % (options.shift * -1))
        options.scanwindow = 2 * options.d  # remove the effect of --bw
    else:
        peakmodel = PeakModel(treatment=treat,
                              max_pairnum=MAX_PAIRNUM,
                              opt=options)
        try:
            peakmodel.build()
            info("#2 finished!")
            debug("#2 Summary Model:")
            debug("#2 min_tags: %d" % (peakmodel.min_tags))
            debug("#2 d: %d" % (peakmodel.d))
            debug("#2 scan_window: %d" % (peakmodel.scan_window))
            info("#2 predicted fragment length is %d bps" % peakmodel.d)
            info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str, peakmodel.alternative_d)))
            info("#2.2 Generate R script for model : %s" % (options.modelR))
            model2r_script(peakmodel, options.modelR, options.name)
            options.d = peakmodel.d
            options.scanwindow = 2 * options.d
            if options.d <= 2 * options.tsize:
                warn("#2 Since the d (%.0f) calculated from paired-peaks is smaller than 2*tag length, it may be influenced by unknown sequencing problem!" % (options.d))
                if options.onauto:
                    options.d = options.extsize
                    options.scanwindow = 2 * options.d
                    warn("#2 MACS will use %d as EXTSIZE/fragment length d. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options.d))
                else:
                    warn("#2 You may need to consider one of the other alternative d(s): %s" % ','.join(map(str, peakmodel.alternative_d)))
                    warn("#2 You can restart the process with --nomodel --extsize XXX with your choice or an arbitrary number. Nonetheless, MACS will continue computing.")
        except NotEnoughPairsException:
            if not options.onauto:
                sys.exit(1)
            warn("#2 Skipped...")
            options.d = options.extsize
            options.scanwindow = 2 * options.d
            warn("#2 Since --fix-bimodal is set, MACS will use %d as fragment length" % (options.d))

    # 3 Call Peaks
    info("#3 Call peaks...")
    if options.nolambda:
        info("# local lambda is disabled!")

    if control and options.PE_MODE:
        c1 = c1 * 2   # in PE_MODE, PE data has to be doubled since both ends will be counted for calculating background noise.

    # decide the scaling to balance the depth between treatment and control
    if control:
        if options.downsample:
            # use random sampling to balance treatment and control
            info("#3 User prefers to use random sampling instead of linear scaling.")
            if t1 > c1:
                info("#3 MACS is random sampling treatment %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                treat.sample_num(c1, options.seed)
                info("#3 %d Tags from treatment are kept", treat.total)
            elif c1 > t1:
                info("#3 MACS is random sampling control %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                control.sample_num(t1, options.seed)
                info("#3 %d %ss from control are kept", control.total, tag)
            # set options.tocontrol although it wouldn't matter now
            options.tocontrol = False
        else:
            if options.scaleto == "large":
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # true, we will scale control to treatment.
                    options.tocontrol = False
                else:
                    # treatment has less tags than control, since tolarge is
                    # true, we will scale treatment to control.
                    options.tocontrol = True
            else:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # false, we will scale treatment to control.
                    options.tocontrol = True
                else:
                    # treatment has less tags than control, since tolarge is
                    # false, we will scale control to treatment.
                    options.tocontrol = False

    peakdetect = PeakDetect(treat=treat,
                            control=control,
                            opt=options)
    peakdetect.call_peaks()

    # filter out low FE peaks
    peakdetect.peaks.filter_fc(fc_low=options.fecutoff)

    # 4 output
    # 4.1 peaks in XLS
    info("#4 Write output xls file... %s" % (options.peakxls))
    ofhd_xls = open(options.peakxls, "w")
    ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION))
    ofhd_xls.write(options.argtxt + "\n")
    ofhd_xls.write(tagsinfo)
    if options.shift > 0:
        ofhd_xls.write("# Sequencing ends will be shifted towards 3' by %d bp(s)\n" % (options.shift))
    elif options.shift < 0:
        ofhd_xls.write("# Sequencing ends will be shifted towards 5' by %d bp(s)\n" % (options.shift * -1))

    ofhd_xls.write("# d = %d\n" % (options.d))
    try:
        ofhd_xls.write("# alternative fragment length(s) may be %s bps\n" % ','.join(map(str, peakmodel.alternative_d)))
    except NameError:
        # when --nomodel is used, there is no peakmodel object. Simply skip this line.
        pass
    if options.nolambda:
        ofhd_xls.write("# local lambda is disabled!\n")
    # pass write method so we can print too, and include name
    peakdetect.peaks.write_to_xls(ofhd_xls, name=options.name.encode())
    ofhd_xls.close()

    # 4.2 peaks in BED
    if options.log_pvalue is not None:
        score_column = "pscore"
    elif options.log_qvalue is not None:
        score_column = "qscore"
    # 4.2 peaks in narrowPeak
    if not options.broad:
        info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak))
        ofhd_bed = open(options.peakNarrowPeak, "w")
        peakdetect.peaks.write_to_narrowPeak(ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), score_column=score_column, trackline=options.trackline)
        ofhd_bed.close()
        # 4.2-2 summits in BED
        info("#4 Write summits bed file... %s" % (options.summitbed))
        ofhd_summits = open(options.summitbed, "w")
        peakdetect.peaks.write_to_summit_bed(ofhd_summits,
                                             name_prefix="%s_peak_".encode(),
                                             name=options.name.encode(),
                                             description=("Summits for %s (Made with MACS v3, "
                                                          + strftime("%x") +
                                                          ")").encode(),
                                             score_column=score_column,
                                             trackline=options.trackline)
        ofhd_summits.close()
    # 4.2 broad peaks in bed12 or gappedPeak
    else:
        info("#4 Write broad peak in broadPeak format file... %s" % (options.peakBroadPeak))
        ofhd_bed = open(options.peakBroadPeak, "w")
        peakdetect.peaks.write_to_broadPeak(ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), description=options.name.encode(), score_column=score_column, trackline=options.trackline)
        ofhd_bed.close()
        info("#4 Write broad peak in bed12/gappedPeak format file... %s" % (options.peakGappedPeak))
        ofhd_bed = open(options.peakGappedPeak, "w")
        peakdetect.peaks.write_to_gappedPeak(ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), description=options.name.encode(), score_column=score_column, trackline=options.trackline)
        ofhd_bed.close()

    info("Done!")


def cal_max_dup_tags(genome_size, tags_number, p=1e-5):
    """Calculate the maximum duplicated tag number based on genome
    size, total tag number and a p-value based on binomial
    distribution. Brute force algorithm to calculate reverse CDF no
    more than MAX_LAMBDA(100000).
    """
    return binomial_cdf_inv(1 - p, tags_number, 1.0 / genome_size)
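

# Illustrative note (not from the original source): cal_max_dup_tags models the
# tag count at one genomic position as Binomial(tags_number, 1/genome_size) and
# returns, roughly, the largest count still compatible with chance at the given
# p-value. For example, with a ~3e9 bp genome and ~2e7 tags the expected count
# per position is only ~0.007, so --keep-dup auto typically keeps just a few
# identical tags/fragments per position.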


def load_frag_files_options(options):
    """From the options, load treatment fragments and control fragments (if available).
    """
    options.info("#1 read treatment fragments...")

    tp = options.parser(options.tfile[0], buffer_size=options.buffer_size)
    treat = tp.build_petrack()
    if len(options.tfile) > 1:
        # multiple input
        for tfile in options.tfile[1:]:
            tp = options.parser(tfile, buffer_size=options.buffer_size)
            treat = tp.append_petrack(treat)
    treat.finalize()

    options.tsize = tp.d
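    # In PE mode, options.tsize holds the mean fragment length reported by the
    # parser (tp.d) rather than a read length; it is logged below and reused as
    # the fragment size when --nomodel is in effect.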
    if options.cfile:
        options.info("#1.2 read input fragments...")
        cp = options.parser(options.cfile[0], buffer_size=options.buffer_size)
        control = cp.build_petrack()
        control_d = cp.d
        if len(options.cfile) > 1:
            # multiple input
            for cfile in options.cfile[1:]:
                cp = options.parser(cfile, buffer_size=options.buffer_size)
                control = cp.append_petrack(control)
        control.finalize()
    else:
        control = None
    options.info("#1 mean fragment size is determined as %.1f bp from treatment" % options.tsize)
    # options.info("#1 fragment size variance is determined as %d bp from treatment" % tp.variance)
    if control is not None:
        options.info("#1 note: mean fragment size in control is %.1f bp -- value ignored" % control_d)
    return (treat, control)


def load_tag_files_options(options):
    """From the options, load treatment tags and control tags (if available).
    """
    options.info("#1 read treatment tags...")
    tp = options.parser(options.tfile[0], buffer_size=options.buffer_size)
    if not options.tsize:
        # determine tag size from the data only when the user did not set --tsize
        ttsize = tp.tsize()
        options.tsize = ttsize
    treat = tp.build_fwtrack()
    if len(options.tfile) > 1:
        # multiple input
        for tfile in options.tfile[1:]:
            tp = options.parser(tfile, buffer_size=options.buffer_size)
            treat = tp.append_fwtrack(treat)
    treat.finalize()

    if options.cfile:
        options.info("#1.2 read input tags...")
        control = options.parser(options.cfile[0], buffer_size=options.buffer_size).build_fwtrack()
        if len(options.cfile) > 1:
            # multiple input
            for cfile in options.cfile[1:]:
                cp = options.parser(cfile, buffer_size=options.buffer_size)
                control = cp.append_fwtrack(control)
        control.finalize()
    else:
        control = None
    options.info("#1 tag size is determined as %d bps" % options.tsize)
    return (treat, control)
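

# Example usage (for orientation; this module is normally invoked through the
# standard `macs3 callpeak` entry point, which parses the command line and hands
# the resulting namespace to run() above):
#
#   macs3 callpeak -t treatment.bam -c control.bam -f BAMPE -g hs -n test -q 0.05
#
# With -f BAMPE/BEDPE the paired-end branch (load_frag_files_options) is used;
# otherwise reads are loaded as single-end tags via load_tag_files_options.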