Merge pull request #378 from taoliu/fix_setup_script_364
[MACS.git] / MACS2 / callpeak_cmd.py
blob3ee167422c519ede3cafd1b30d1def94f0f0bae5
1 # Time-stamp: <2019-11-04 10:41:13 taoliu>
3 """Description: MACS 2 main executable
5 This code is free software; you can redistribute it and/or modify it
6 under the terms of the BSD License (see the file LICENSE included with
7 the distribution).
8 """
10 # ------------------------------------
11 # python modules
12 # ------------------------------------
14 import os
15 import sys
16 import logging
17 from time import strftime
18 import tempfile
20 # ------------------------------------
21 # own python modules
22 # ------------------------------------
23 from MACS2.OptValidator import opt_validate
24 from MACS2.Prob import binomial_cdf_inv
25 from MACS2.PeakModel import PeakModel,NotEnoughPairsException
26 from MACS2.PeakDetect import PeakDetect
27 from MACS2.OutputWriter import model2r_script
28 from MACS2.Constants import *
29 # ------------------------------------
30 # Main function
31 # ------------------------------------
def check_names(treat, control, error_stream):
    """Verify treatment and control share at least one chromosome name.

    Parameters:
      treat, control : track objects exposing get_chr_names()
      error_stream   : callable taking a message string (e.g. options.error)

    If the two inputs have no chromosome name in common, the mismatch is
    reported through error_stream and the process exits with a non-zero
    status, since peak calling cannot proceed on disjoint references.
    """
    tchrnames = set(treat.get_chr_names())
    cchrnames = set(control.get_chr_names())
    commonnames = tchrnames.intersection(cchrnames)
    if len(commonnames) == 0:
        error_stream("No common chromosome names can be found from treatment and control! Check your input files! MACS will quit...")
        error_stream("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames)))
        error_stream("Chromosome names in control: %s" % ",".join(sorted(cchrnames)))
        # This is a fatal error: exit non-zero so shells/pipelines can
        # detect the failure (bare sys.exit() would report success/0).
        sys.exit(1)
def run( args ):
    """The Main function/pipeline for MACS callpeak.

    Steps:
      #1 read treatment/control alignment files, filter duplicate reads
      #2 build the peak model to estimate fragment length d (unless --nomodel)
      #3 call peaks, balancing treatment vs control by scaling or sampling
      #4 write XLS plus narrowPeak/summit or broadPeak/gappedPeak outputs
    """
    # Parse options...
    options = opt_validate( args )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error
    #0 output arguments
    info("\n"+options.argtxt)
    # paired-end formats carry whole fragments instead of single-end tags
    options.PE_MODE = options.format in ('BAMPE','BEDPE')
    if options.PE_MODE: tag = 'fragment' # call things fragments not tags
    else: tag = 'tag'

    tempfile.tempdir = options.tempdir

    #1 Read tag files
    info("#1 read %s files...", tag)
    if options.PE_MODE: (treat, control) = load_frag_files_options (options)
    else: (treat, control) = load_tag_files_options (options)
    if control is not None: check_names(treat, control, error)

    info("#1 %s size = %.1f", tag, options.tsize)
    tagsinfo = "# %s size is determined as %d bps\n" % (tag, options.tsize)

    t0 = treat.total
    tagsinfo += "# total %ss in treatment: %d\n" % (tag, t0)
    info("#1 total %ss in treatment: %d", tag, t0)
    # NOTE: a separate "filteringmodel" duplicate-separation path existed
    # upstream but is not ready yet; only plain filter_dup is used here.
    if options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            # pick the cutoff expected by chance at p=1e-5 given depth/genome
            info("#1 calculate max duplicate %ss in single position based on binomial distribution...", tag)
            treatment_max_dup_tags = cal_max_dup_tags(options.gsize,t0)
            info("#1 max_dup_tags based on binomial = %d" % (treatment_max_dup_tags))
        else:
            info("#1 user defined the maximum %ss...", tag)
            treatment_max_dup_tags = int(options.keepduplicates)
        if options.PE_MODE:
            info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
        else:
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
        treat.filter_dup(treatment_max_dup_tags)
        t1 = treat.total
        info("#1 %ss after filtering in treatment: %d", tag, t1)
        tagsinfo += "# %ss after filtering in treatment: %d\n" % (tag, t1)
        if options.PE_MODE:
            tagsinfo += "# maximum duplicate fragments in treatment = %d\n" % (treatment_max_dup_tags)
        else:
            tagsinfo += "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags)
        info("#1 Redundant rate of treatment: %.2f", float(t0 - t1) / t0)
        tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0)
    else:
        t1 = t0

    if control is not None:
        c0 = control.total
        tagsinfo += "# total %ss in control: %d\n" % (tag, c0)
        info("#1 total %ss in control: %d", tag, c0)
        if options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("#1 for control, calculate max duplicate %ss in single position based on binomial distribution...", tag)
                control_max_dup_tags = cal_max_dup_tags(options.gsize,c0)
                info("#1 max_dup_tags based on binomial = %d" % (control_max_dup_tags))
            else:
                info("#1 user defined the maximum %ss...", tag)
                control_max_dup_tags = int(options.keepduplicates)
            if options.PE_MODE:
                info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
            else:
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
            # NOTE(review): control is filtered with the *treatment* cutoff;
            # control_max_dup_tags is computed above but never used -- looks
            # suspicious, confirm this is intentional upstream behavior.
            control.filter_dup(treatment_max_dup_tags)
            c1 = control.total

            info("#1 %ss after filtering in control: %d", tag, c1)
            tagsinfo += "# %ss after filtering in control: %d\n" % (tag, c1)
            if options.PE_MODE:
                tagsinfo += "# maximum duplicate fragments in control = %d\n" % (treatment_max_dup_tags)
            else:
                tagsinfo += "# maximum duplicate tags at the same position in control = %d\n" % (treatment_max_dup_tags)

            info("#1 Redundant rate of control: %.2f" % (float(c0-c1)/c0))
            tagsinfo += "# Redundant rate in control: %.2f\n" % (float(c0-c1)/c0)
        else:
            c1 = c0
    info("#1 finished!")

    #2 Build Model
    info("#2 Build Peak Model...")

    if options.nomodel:
        info("#2 Skipped...")
        if options.PE_MODE:
            # PE fragments already have real lengths; use mean fragment size
            options.d = options.tsize
        else:
            options.d = options.extsize
        info("#2 Use %d as fragment length" % (options.d))
        if options.shift > 0:
            info("#2 Sequencing ends will be shifted towards 3' by %d bp(s)" % (options.shift))
        elif options.shift < 0:
            info("#2 Sequencing ends will be shifted towards 5' by %d bp(s)" % (options.shift * -1))
        options.scanwindow = 2*options.d # remove the effect of --bw
    else:
        try:
            peakmodel = PeakModel(treatment = treat,
                                  max_pairnum = MAX_PAIRNUM,
                                  opt = options
                                  )
            info("#2 finished!")
            debug("#2 Summary Model:")
            debug("#2 min_tags: %d" % (peakmodel.min_tags))
            debug("#2 d: %d" % (peakmodel.d))
            debug("#2 scan_window: %d" % (peakmodel.scan_window))
            info("#2 predicted fragment length is %d bps" % peakmodel.d)
            info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel.alternative_d)))
            info("#2.2 Generate R script for model : %s" % (options.modelR))
            model2r_script(peakmodel,options.modelR,options.name)
            options.d = peakmodel.d
            options.scanwindow = 2*options.d
            if options.d <= 2*options.tsize:
                warn("#2 Since the d (%.0f) calculated from paired-peaks are smaller than 2*tag length, it may be influenced by unknown sequencing problem!" % (options.d))
                if options.onauto:
                    # --fix-bimodal: fall back to user-supplied extsize
                    options.d = options.extsize
                    options.scanwindow = 2*options.d
                    warn("#2 MACS will use %d as EXTSIZE/fragment length d. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options.d))
                else:
                    warn("#2 You may need to consider one of the other alternative d(s): %s" % ','.join(map(str,peakmodel.alternative_d)))
                    warn("#2 You can restart the process with --nomodel --extsize XXX with your choice or an arbitrary number. Nontheless, MACS will continute computing.")
        except NotEnoughPairsException:
            # model building failed; only --fix-bimodal lets us continue
            if not options.onauto:
                sys.exit(1)
            warn("#2 Skipped...")
            options.d = options.extsize
            options.scanwindow = 2*options.d
            warn("#2 Since --fix-bimodal is set, MACS will use %d as fragment length" % (options.d))

    #3 Call Peaks
    info("#3 Call peaks...")
    if options.nolambda:
        info("# local lambda is disabled!")

    # decide options.tocontrol according to options.tolarge
    if control and options.PE_MODE:
        # each PE control fragment contributes two ends for lambda purposes
        c1 = c1 * 2

    if control:
        if options.downsample:
            # use random sampling to balance treatment and control
            info("#3 User prefers to use random sampling instead of linear scaling.")
            if t1 > c1:
                info("#3 MACS is random sampling treatment %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                treat.sample_num(c1, options.seed)
                info("#3 %d Tags from treatment are kept", treat.total)
            elif c1 > t1:
                info("#3 MACS is random sampling control %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                control.sample_num(t1, options.seed)
                info("#3 %d %ss from control are kept", control.total, tag)
            # set options.tocontrol although it wouldn't matter now
            options.tocontrol = False
        else:
            if options.scaleto == "large":
                # scale the smaller dataset up to the larger one
                options.tocontrol = (t1 <= c1)
            else:
                # scale the larger dataset down to the smaller one
                options.tocontrol = (t1 > c1)

    peakdetect = PeakDetect(treat = treat,
                            control = control,
                            opt = options
                            )
    peakdetect.call_peaks()

    # filter out low fold-enrichment peaks
    peakdetect.peaks.filter_fc( fc_low = options.fecutoff )

    #4 output
    #4.1 peaks in XLS
    info("#4 Write output xls file... %s" % (options.peakxls))
    ofhd_xls = open( options.peakxls, "w" )
    ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION))
    ofhd_xls.write(options.argtxt+"\n")
    ofhd_xls.write(tagsinfo)
    if options.shift > 0:
        ofhd_xls.write("# Sequencing ends will be shifted towards 3' by %d bp(s)\n" % (options.shift))
    elif options.shift < 0:
        ofhd_xls.write("# Sequencing ends will be shifted towards 5' by %d bp(s)\n" % (options.shift * -1))
    ofhd_xls.write("# d = %d\n" % (options.d))
    try:
        ofhd_xls.write("# alternative fragment length(s) may be %s bps\n" % ','.join(map(str,peakmodel.alternative_d)))
    except NameError:
        # when --nomodel is used, there is no peakmodel object. Simply skip this line.
        pass
    if options.nolambda:
        ofhd_xls.write("# local lambda is disabled!\n")
    # pass write method so we can print too, and include name
    peakdetect.peaks.write_to_xls(ofhd_xls, name = options.name.encode())
    ofhd_xls.close()

    #4.2 choose score column for BED-family outputs; opt_validate guarantees
    # exactly one of log_pvalue/log_qvalue is set
    if options.log_pvalue is not None:
        score_column = "pscore"
    elif options.log_qvalue is not None:
        score_column = "qscore"

    if not options.broad:
        #4.2 peaks in narrowPeak
        info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak))
        ofhd_bed = open( options.peakNarrowPeak, "w" )
        peakdetect.peaks.write_to_narrowPeak (ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), score_column=score_column, trackline=options.trackline )
        ofhd_bed.close()
        #4.2-2 summits in BED
        info("#4 Write summits bed file... %s" % (options.summitbed))
        ofhd_summits = open( options.summitbed, "w" )
        peakdetect.peaks.write_to_summit_bed (ofhd_summits, name_prefix="%s_peak_".encode(), name=options.name.encode(),
                                              description=("Summits for %s (Made with MACS v2, " + strftime("%x") + ")").encode(),
                                              score_column=score_column, trackline=options.trackline )
        ofhd_summits.close()
    else:
        #4.2 broad peaks in bed12 or gappedPeak
        info("#4 Write broad peak in broadPeak format file... %s" % (options.peakBroadPeak))
        ofhd_bed = open( options.peakBroadPeak, "w" )
        peakdetect.peaks.write_to_broadPeak (ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), description=options.name.encode(), score_column=score_column, trackline=options.trackline)
        ofhd_bed.close()
        info("#4 Write broad peak in bed12/gappedPeak format file... %s" % (options.peakGappedPeak))
        ofhd_bed = open( options.peakGappedPeak, "w" )
        peakdetect.peaks.write_to_gappedPeak (ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), description=options.name.encode(), score_column=score_column, trackline=options.trackline)
        ofhd_bed.close()

    info("Done!")
def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ):
    """Return the maximum allowed duplicate tag count at one position.

    The cutoff is the inverse binomial CDF at confidence level 1-p,
    given the total tag number and a per-position probability of
    1/genome_size. The underlying brute-force reverse-CDF search is
    capped at MAX_LAMBDA (100000).
    """
    per_site_prob = 1.0 / genome_size
    confidence = 1 - p
    return binomial_cdf_inv(confidence, tags_number, per_site_prob)
def load_frag_files_options ( options ):
    """Load treatment fragments (and control fragments, if given) as PE tracks.

    Returns a (treat, control) tuple; control is None when no control
    file was supplied. options.tsize is set to the mean fragment size
    reported by the parser of the last treatment file.
    """
    options.info("#1 read treatment fragments...")

    frag_parser = options.parser(options.tfile[0], buffer_size=options.buffer_size)
    treat = frag_parser.build_petrack()
    # append any additional treatment files onto the same track
    for extra_tfile in options.tfile[1:]:
        frag_parser = options.parser(extra_tfile, buffer_size=options.buffer_size)
        treat = frag_parser.append_petrack( treat )
    treat.finalize()

    # mean fragment size comes from the last treatment parser
    options.tsize = frag_parser.d

    if options.cfile:
        options.info("#1.2 read input fragments...")
        ctrl_parser = options.parser(options.cfile[0], buffer_size=options.buffer_size)
        control = ctrl_parser.build_petrack()
        # remember the first control file's mean fragment size for reporting
        control_d = ctrl_parser.d
        for extra_cfile in options.cfile[1:]:
            ctrl_parser = options.parser(extra_cfile, buffer_size=options.buffer_size)
            control = ctrl_parser.append_petrack( control )
        control.finalize()
    else:
        control = None

    options.info("#1 mean fragment size is determined as %.1f bp from treatment" % options.tsize)
    if control is not None:
        options.info("#1 note: mean fragment size in control is %.1f bp -- value ignored" % control_d)
    return (treat, control)
def load_tag_files_options ( options ):
    """Load treatment tags (and control tags, if given) as fixed-width tracks.

    Returns a (treat, control) tuple; control is None when no control
    file was supplied. When the user did not pass --tsize, the tag size
    is auto-detected from the first treatment file.
    """
    options.info("#1 read treatment tags...")

    tag_parser = options.parser(options.tfile[0], buffer_size=options.buffer_size)
    if not options.tsize:
        # no user-specified --tsize: detect it from the first treatment file
        detected_tsize = tag_parser.tsize()
        options.tsize = detected_tsize

    treat = tag_parser.build_fwtrack()
    # append any additional treatment files onto the same track
    for extra_tfile in options.tfile[1:]:
        tag_parser = options.parser(extra_tfile, buffer_size=options.buffer_size)
        treat = tag_parser.append_fwtrack( treat )
    treat.finalize()

    if options.cfile:
        options.info("#1.2 read input tags...")
        control = options.parser(options.cfile[0], buffer_size=options.buffer_size).build_fwtrack()
        for extra_cfile in options.cfile[1:]:
            ctrl_parser = options.parser(extra_cfile, buffer_size=options.buffer_size)
            control = ctrl_parser.append_fwtrack( control )
        control.finalize()
    else:
        control = None

    options.info("#1 tag size is determined as %d bps" % options.tsize)
    return (treat, control)