# MACS3/Commands/callpeak_cmd.py
# Snapshot from the commit "implement PETrackII for fragment files from scATAC experiments".
# Time-stamp: <2024-10-02 15:58:36 Tao Liu>

"""Description: MACS 3 call peak main executable

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""

# ------------------------------------
# python modules
# ------------------------------------

import sys
from time import strftime
import tempfile

# ------------------------------------
# MACS3 python modules
# ------------------------------------

from MACS3.Utilities.Constants import MACS_VERSION, MAX_PAIRNUM
from MACS3.Utilities.OptValidator import opt_validate_callpeak
from MACS3.Signal.Prob import binomial_cdf_inv
from MACS3.Signal.PeakModel import PeakModel, NotEnoughPairsException
from MACS3.Signal.PeakDetect import PeakDetect
from MACS3.IO.OutputWriter import model2r_script

# ------------------------------------
# Main function
# ------------------------------------


def check_names(treat, control, error_stream):
    """check common chromosome names"""
    tchrnames = set(treat.get_chr_names())
    cchrnames = set(control.get_chr_names())
    commonnames = tchrnames.intersection(cchrnames)
    if len(commonnames) == 0:
        error_stream("No common chromosome names can be found from treatment and control!")
        error_stream("Please make sure that the treatment and control alignment files were generated by using the same genome assembly!")
        error_stream("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames)))
        error_stream("Chromosome names in control: %s" % ",".join(sorted(cchrnames)))
        sys.exit()
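

# run() drives the callpeak pipeline in four numbered stages that mirror the log
# messages below: #1 read alignment files and filter duplicates, #2 build (or
# skip) the fragment-size model, #3 call and filter peaks, and #4 write the
# XLS/narrowPeak/broadPeak/summit output files.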
def run(args):
    """The Main function/pipeline for MACS.

    """
    # Parse options...
    options = opt_validate_callpeak(args)
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    # 0 output arguments
    info("\n" + options.argtxt)
    options.PE_MODE = options.format in ('BAMPE', 'BEDPE')
    if options.PE_MODE:
        tag = 'fragment'        # call things fragments not tags
    else:
        tag = 'tag'
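
    # In PE mode (BAMPE/BEDPE input), whole sequenced fragments are loaded as
    # paired-end tracks and counted as "fragments" rather than single-end "tags";
    # see load_frag_files_options() below.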
    tempfile.tempdir = options.tempdir
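    # Note: pointing tempfile.tempdir at --tempdir makes any temporary files that
    # Python's tempfile module creates later in the run land in the user-chosen
    # directory (useful when the default temporary location is small).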

    # 1 Read tag files
    info("#1 read %s files...", tag)
    if options.PE_MODE:
        (treat, control) = load_frag_files_options(options)
    else:
        (treat, control) = load_tag_files_options(options)
    if control is not None:
        # check if chromosome names are consistent. quit if not.
        check_names(treat, control, error)

    info("#1 %s size = %.1f", tag, options.tsize)
    tagsinfo = "# %s size is determined as %d bps\n" % (tag, options.tsize)

    t0 = treat.total
    tagsinfo += "# total %ss in treatment: %d\n" % (tag, t0)
    info("#1 total %ss in treatment: %d", tag, t0)

    # handle duplicates
    if options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            info("#1 calculate max duplicate %ss in single position based on binomial distribution...", tag)
            treatment_max_dup_tags = cal_max_dup_tags(options.gsize, t0)
            info("#1 max_dup_tags based on binomial = %d" % (treatment_max_dup_tags))
        else:
            info("#1 user defined the maximum %ss...", tag)
            treatment_max_dup_tags = int(options.keepduplicates)
        if options.PE_MODE:
            info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
        else:
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)

        treat.filter_dup(treatment_max_dup_tags)
        t1 = treat.total
        info("#1 %ss after filtering in treatment: %d", tag, t1)
        tagsinfo += "# %ss after filtering in treatment: %d\n" % (tag, t1)
        if options.PE_MODE:
            tagsinfo += "# maximum duplicate fragments in treatment = %d\n" % (treatment_max_dup_tags)
        else:
            tagsinfo += "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags)
        info("#1 Redundant rate of treatment: %.2f", float(t0 - t1) / t0)
        tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0 - t1) / t0)
    else:
        t1 = t0

    if control is not None:
        c0 = control.total
        tagsinfo += "# total %ss in control: %d\n" % (tag, c0)
        info("#1 total %ss in control: %d", tag, c0)

        if options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("#1 for control, calculate max duplicate %ss in single position based on binomial distribution...", tag)
                control_max_dup_tags = cal_max_dup_tags(options.gsize, c0)
                info("#1 max_dup_tags based on binomial = %d" % (control_max_dup_tags))
            else:
                info("#1 user defined the maximum %ss...", tag)
                control_max_dup_tags = int(options.keepduplicates)
            if options.PE_MODE:
                info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
            else:
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
            control.filter_dup(treatment_max_dup_tags)
            # control.separate_dups(treatment_max_dup_tags)  # changed 5-29; changed back since we don't need to call addbackdup+refinepeak anymore
            c1 = control.total

            info("#1 %ss after filtering in control: %d", tag, c1)
            tagsinfo += "# %ss after filtering in control: %d\n" % (tag, c1)
            if options.PE_MODE:
                tagsinfo += "# maximum duplicate fragments in control = %d\n" % (treatment_max_dup_tags)
            else:
                tagsinfo += "# maximum duplicate tags at the same position in control = %d\n" % (treatment_max_dup_tags)

            info("#1 Redundant rate of control: %.2f" % (float(c0 - c1) / c0))
            tagsinfo += "# Redundant rate in control: %.2f\n" % (float(c0 - c1) / c0)
        else:
            c1 = c0
    info("#1 finished!")

    # 2 Build Model
    info("#2 Build Peak Model...")
    if options.nomodel:
        info("#2 Skipped...")
        if options.PE_MODE:
            options.d = options.tsize
        else:
            options.d = options.extsize
        info("#2 Use %d as fragment length" % (options.d))
        if options.shift > 0:
            info("#2 Sequencing ends will be shifted towards 3' by %d bp(s)" % options.shift)
        elif options.shift < 0:
            info("#2 Sequencing ends will be shifted towards 5' by %d bp(s)" % (options.shift * -1))
        options.scanwindow = 2 * options.d  # remove the effect of --bw
    else:
        peakmodel = PeakModel(treatment=treat,
                              max_pairnum=MAX_PAIRNUM,
                              opt=options)
        try:
            peakmodel.build()
            info("#2 finished!")
            debug("#2 Summary Model:")
            debug("#2 min_tags: %d" % (peakmodel.min_tags))
            debug("#2 d: %d" % (peakmodel.d))
            debug("#2 scan_window: %d" % (peakmodel.scan_window))
            info("#2 predicted fragment length is %d bps" % peakmodel.d)
            info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str, peakmodel.alternative_d)))
            info("#2.2 Generate R script for model : %s" % (options.modelR))
            model2r_script(peakmodel, options.modelR, options.name)
            options.d = peakmodel.d
            options.scanwindow = 2 * options.d
            if options.d <= 2 * options.tsize:
                warn("#2 Since the d (%.0f) calculated from paired-peaks is smaller than 2*tag length, it may be influenced by unknown sequencing problem!" % (options.d))
                if options.onauto:
                    options.d = options.extsize
                    options.scanwindow = 2 * options.d
                    warn("#2 MACS will use %d as EXTSIZE/fragment length d. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options.d))
                else:
                    warn("#2 You may need to consider one of the other alternative d(s): %s" % ','.join(map(str, peakmodel.alternative_d)))
                    warn("#2 You can restart the process with --nomodel --extsize XXX with your choice or an arbitrary number. Nonetheless, MACS will continue computing.")
        except NotEnoughPairsException:
            if not options.onauto:
                sys.exit(1)
            warn("#2 Skipped...")
            options.d = options.extsize
            options.scanwindow = 2 * options.d
            warn("#2 Since --fix-bimodal is set, MACS will use %d as fragment length" % (options.d))

    # 3 Call Peaks
    info("#3 Call peaks...")
    if options.nolambda:
        info("# local lambda is disabled!")

    if control and options.PE_MODE:
        c1 = c1 * 2   # in PE_MODE, PE data has to be doubled since both ends will be counted for calculating background noise.

    # decide the scaling to balance the depth between treatment and control
    if control:
        if options.downsample:
            # use random sampling to balance treatment and control
            info("#3 User prefers to use random sampling instead of linear scaling.")
            if t1 > c1:
                info("#3 MACS is random sampling treatment %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                treat.sample_num(c1, options.seed)
                info("#3 %d Tags from treatment are kept", treat.total)
            elif c1 > t1:
                info("#3 MACS is random sampling control %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                control.sample_num(t1, options.seed)
                info("#3 %d %ss from control are kept", control.total, tag)
            # set options.tocontrol although it wouldn't matter now
            options.tocontrol = False
        else:
            if options.scaleto == "large":
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # true, we will scale control to treatment.
                    options.tocontrol = False
                else:
                    # treatment has less tags than control, since tolarge is
                    # true, we will scale treatment to control.
                    options.tocontrol = True
            else:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # false, we will scale treatment to control.
                    options.tocontrol = True
                else:
                    # treatment has less tags than control, since tolarge is
                    # false, we will scale control to treatment.
                    options.tocontrol = False

    peakdetect = PeakDetect(treat=treat,
                            control=control,
                            opt=options)
    peakdetect.call_peaks()

    # filter out low FE peaks
    peakdetect.peaks.filter_fc(fc_low=options.fecutoff)

    # 4 output
    # 4.1 peaks in XLS
    info("#4 Write output xls file... %s" % (options.peakxls))
    ofhd_xls = open(options.peakxls, "w")
    ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION))
    ofhd_xls.write(options.argtxt + "\n")
    ofhd_xls.write(tagsinfo)
    if options.shift > 0:
        ofhd_xls.write("# Sequencing ends will be shifted towards 3' by %d bp(s)\n" % (options.shift))
    elif options.shift < 0:
        ofhd_xls.write("# Sequencing ends will be shifted towards 5' by %d bp(s)\n" % (options.shift * -1))

    ofhd_xls.write("# d = %d\n" % (options.d))
    try:
        ofhd_xls.write("# alternative fragment length(s) may be %s bps\n" % ','.join(map(str, peakmodel.alternative_d)))
    except NameError:
        # when --nomodel is used, there is no peakmodel object. Simply skip this line.
        pass
    if options.nolambda:
        ofhd_xls.write("# local lambda is disabled!\n")
    # pass write method so we can print too, and include name
    peakdetect.peaks.write_to_xls(ofhd_xls, name=options.name.encode())
    ofhd_xls.close()

    # 4.2 peaks in BED
    if options.log_pvalue is not None:
        score_column = "pscore"
    elif options.log_qvalue is not None:
        score_column = "qscore"
    # 4.2 peaks in narrowPeak
    if not options.broad:
        info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak))
        ofhd_bed = open(options.peakNarrowPeak, "w")
        peakdetect.peaks.write_to_narrowPeak(ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), score_column=score_column, trackline=options.trackline)
        ofhd_bed.close()
        # 4.2-2 summits in BED
        info("#4 Write summits bed file... %s" % (options.summitbed))
        ofhd_summits = open(options.summitbed, "w")
        peakdetect.peaks.write_to_summit_bed(ofhd_summits,
                                             name_prefix="%s_peak_".encode(),
                                             name=options.name.encode(),
                                             description=("Summits for %s (Made with MACS v3, "
                                                          + strftime("%x") +
                                                          ")").encode(),
                                             score_column=score_column,
                                             trackline=options.trackline)
        ofhd_summits.close()
    # 4.2 broad peaks in bed12 or gappedPeak
    else:
        info("#4 Write broad peak in broadPeak format file... %s" % (options.peakBroadPeak))
        ofhd_bed = open(options.peakBroadPeak, "w")
        peakdetect.peaks.write_to_broadPeak(ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), description=options.name.encode(), score_column=score_column, trackline=options.trackline)
        ofhd_bed.close()
        info("#4 Write broad peak in bed12/gappedPeak format file... %s" % (options.peakGappedPeak))
        ofhd_bed = open(options.peakGappedPeak, "w")
        peakdetect.peaks.write_to_gappedPeak(ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), description=options.name.encode(), score_column=score_column, trackline=options.trackline)
        ofhd_bed.close()

    info("Done!")


def cal_max_dup_tags(genome_size, tags_number, p=1e-5):
    """Calculate the maximum duplicated tag number based on genome
    size, total tag number and a p-value based on binomial
    distribution. Brute force algorithm to calculate reverse CDF no
    more than MAX_LAMBDA(100000).
    """
    return binomial_cdf_inv(1 - p, tags_number, 1.0 / genome_size)
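

# Illustrative note (not from the original source): cal_max_dup_tags models the
# tag count at one genomic position as Binomial(tags_number, 1/genome_size) and
# returns, roughly, the largest count still compatible with chance at the given
# p-value. For example, with a ~3e9 bp genome and ~2e7 tags the expected count
# per position is only ~0.007, so --keep-dup auto typically keeps just a few
# identical tags/fragments per position.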


def load_frag_files_options(options):
    """From the options, load treatment fragments and control fragments (if available).
    """
    options.info("#1 read treatment fragments...")

    tp = options.parser(options.tfile[0], buffer_size=options.buffer_size)
    treat = tp.build_petrack()
    if len(options.tfile) > 1:
        # multiple input
        for tfile in options.tfile[1:]:
            tp = options.parser(tfile, buffer_size=options.buffer_size)
            treat = tp.append_petrack(treat)
    treat.finalize()

    options.tsize = tp.d
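    # In PE mode, options.tsize holds the mean fragment length reported by the
    # parser (tp.d) rather than a read length; it is logged below and reused as
    # the fragment size when --nomodel is in effect.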
    if options.cfile:
        options.info("#1.2 read input fragments...")
        cp = options.parser(options.cfile[0], buffer_size=options.buffer_size)
        control = cp.build_petrack()
        control_d = cp.d
        if len(options.cfile) > 1:
            # multiple input
            for cfile in options.cfile[1:]:
                cp = options.parser(cfile, buffer_size=options.buffer_size)
                control = cp.append_petrack(control)
        control.finalize()
    else:
        control = None
    options.info("#1 mean fragment size is determined as %.1f bp from treatment" % options.tsize)
    # options.info("#1 fragment size variance is determined as %d bp from treatment" % tp.variance)
    if control is not None:
        options.info("#1 note: mean fragment size in control is %.1f bp -- value ignored" % control_d)
    return (treat, control)


def load_tag_files_options(options):
    """From the options, load treatment tags and control tags (if available).
    """
    options.info("#1 read treatment tags...")
    tp = options.parser(options.tfile[0], buffer_size=options.buffer_size)
    if not options.tsize:
        # determine tag size from the data only when the user did not set --tsize
        ttsize = tp.tsize()
        options.tsize = ttsize
    treat = tp.build_fwtrack()
    if len(options.tfile) > 1:
        # multiple input
        for tfile in options.tfile[1:]:
            tp = options.parser(tfile, buffer_size=options.buffer_size)
            treat = tp.append_fwtrack(treat)
    treat.finalize()

    if options.cfile:
        options.info("#1.2 read input tags...")
        control = options.parser(options.cfile[0], buffer_size=options.buffer_size).build_fwtrack()
        if len(options.cfile) > 1:
            # multiple input
            for cfile in options.cfile[1:]:
                cp = options.parser(cfile, buffer_size=options.buffer_size)
                control = cp.append_fwtrack(control)
        control.finalize()
    else:
        control = None
    options.info("#1 tag size is determined as %d bps" % options.tsize)
    return (treat, control)
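

# Example usage (for orientation; this module is normally invoked through the
# standard `macs3 callpeak` entry point, which parses the command line and hands
# the resulting namespace to run() above):
#
#   macs3 callpeak -t treatment.bam -c control.bam -f BAMPE -g hs -n test -q 0.05
#
# With -f BAMPE/BEDPE the paired-end branch (load_frag_files_options) is used;
# otherwise reads are loaded as single-end tags via load_tag_files_options.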