Merge pull request #378 from taoliu/fix_setup_script_364
[MACS.git] / MACS2 / callpeak_cmd.py
blob3ee167422c519ede3cafd1b30d1def94f0f0bae5
1 # Time-stamp: <2019-11-04 10:41:13 taoliu>
3 """Description: MACS 2 main executable
5 This code is free software; you can redistribute it and/or modify it
6 under the terms of the BSD License (see the file LICENSE included with
7 the distribution).
8 """
10 # ------------------------------------
11 # python modules
12 # ------------------------------------
14 import os
15 import sys
16 import logging
17 from time import strftime
18 import tempfile
20 # ------------------------------------
21 # own python modules
22 # ------------------------------------
23 from MACS2.OptValidator import opt_validate
24 from MACS2.Prob import binomial_cdf_inv
25 from MACS2.PeakModel import PeakModel,NotEnoughPairsException
26 from MACS2.PeakDetect import PeakDetect
27 from MACS2.OutputWriter import model2r_script
28 from MACS2.Constants import *
29 # ------------------------------------
30 # Main function
31 # ------------------------------------
def check_names(treat, control, error_stream):
    """Verify treatment and control share at least one chromosome name.

    Parameters:
      treat, control : track objects exposing get_chr_names()
      error_stream   : callable taking a message string (e.g. options.error)

    If the two inputs have no chromosome name in common, the mismatch is
    reported through error_stream and the process exits with a non-zero
    status, since peak calling cannot proceed on disjoint references.
    """
    tchrnames = set(treat.get_chr_names())
    cchrnames = set(control.get_chr_names())
    commonnames = tchrnames.intersection(cchrnames)
    if len(commonnames) == 0:
        error_stream("No common chromosome names can be found from treatment and control! Check your input files! MACS will quit...")
        error_stream("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames)))
        error_stream("Chromosome names in control: %s" % ",".join(sorted(cchrnames)))
        # This is a fatal error: exit non-zero so shells/pipelines can
        # detect the failure (bare sys.exit() would report success/0).
        sys.exit(1)
def run( args ):
    """The Main function/pipeline for MACS callpeak.

    Steps:
      #1 read treatment/control alignment files, filter duplicate reads
      #2 build the peak model to estimate fragment length d (unless --nomodel)
      #3 call peaks, balancing treatment vs control by scaling or sampling
      #4 write XLS plus narrowPeak/summit or broadPeak/gappedPeak outputs
    """
    # Parse options...
    options = opt_validate( args )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error
    #0 output arguments
    info("\n"+options.argtxt)
    # paired-end formats carry whole fragments instead of single-end tags
    options.PE_MODE = options.format in ('BAMPE','BEDPE')
    if options.PE_MODE: tag = 'fragment' # call things fragments not tags
    else: tag = 'tag'

    tempfile.tempdir = options.tempdir

    #1 Read tag files
    info("#1 read %s files...", tag)
    if options.PE_MODE: (treat, control) = load_frag_files_options (options)
    else: (treat, control) = load_tag_files_options (options)
    if control is not None: check_names(treat, control, error)

    info("#1 %s size = %.1f", tag, options.tsize)
    tagsinfo = "# %s size is determined as %d bps\n" % (tag, options.tsize)

    t0 = treat.total
    tagsinfo += "# total %ss in treatment: %d\n" % (tag, t0)
    info("#1 total %ss in treatment: %d", tag, t0)
    # NOTE: a separate "filteringmodel" duplicate-separation path existed
    # upstream but is not ready yet; only plain filter_dup is used here.
    if options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            # pick the cutoff expected by chance at p=1e-5 given depth/genome
            info("#1 calculate max duplicate %ss in single position based on binomial distribution...", tag)
            treatment_max_dup_tags = cal_max_dup_tags(options.gsize,t0)
            info("#1 max_dup_tags based on binomial = %d" % (treatment_max_dup_tags))
        else:
            info("#1 user defined the maximum %ss...", tag)
            treatment_max_dup_tags = int(options.keepduplicates)
        if options.PE_MODE:
            info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
        else:
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
        treat.filter_dup(treatment_max_dup_tags)
        t1 = treat.total
        info("#1 %ss after filtering in treatment: %d", tag, t1)
        tagsinfo += "# %ss after filtering in treatment: %d\n" % (tag, t1)
        if options.PE_MODE:
            tagsinfo += "# maximum duplicate fragments in treatment = %d\n" % (treatment_max_dup_tags)
        else:
            tagsinfo += "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags)
        info("#1 Redundant rate of treatment: %.2f", float(t0 - t1) / t0)
        tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0)
    else:
        t1 = t0

    if control is not None:
        c0 = control.total
        tagsinfo += "# total %ss in control: %d\n" % (tag, c0)
        info("#1 total %ss in control: %d", tag, c0)
        if options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("#1 for control, calculate max duplicate %ss in single position based on binomial distribution...", tag)
                control_max_dup_tags = cal_max_dup_tags(options.gsize,c0)
                info("#1 max_dup_tags based on binomial = %d" % (control_max_dup_tags))
            else:
                info("#1 user defined the maximum %ss...", tag)
                control_max_dup_tags = int(options.keepduplicates)
            if options.PE_MODE:
                info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
            else:
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
            # NOTE(review): control is filtered with the *treatment* cutoff;
            # control_max_dup_tags is computed above but never used -- looks
            # suspicious, confirm this is intentional upstream behavior.
            control.filter_dup(treatment_max_dup_tags)
            c1 = control.total

            info("#1 %ss after filtering in control: %d", tag, c1)
            tagsinfo += "# %ss after filtering in control: %d\n" % (tag, c1)
            if options.PE_MODE:
                tagsinfo += "# maximum duplicate fragments in control = %d\n" % (treatment_max_dup_tags)
            else:
                tagsinfo += "# maximum duplicate tags at the same position in control = %d\n" % (treatment_max_dup_tags)

            info("#1 Redundant rate of control: %.2f" % (float(c0-c1)/c0))
            tagsinfo += "# Redundant rate in control: %.2f\n" % (float(c0-c1)/c0)
        else:
            c1 = c0
    info("#1 finished!")

    #2 Build Model
    info("#2 Build Peak Model...")

    if options.nomodel:
        info("#2 Skipped...")
        if options.PE_MODE:
            # PE fragments already have real lengths; use mean fragment size
            options.d = options.tsize
        else:
            options.d = options.extsize
        info("#2 Use %d as fragment length" % (options.d))
        if options.shift > 0:
            info("#2 Sequencing ends will be shifted towards 3' by %d bp(s)" % (options.shift))
        elif options.shift < 0:
            info("#2 Sequencing ends will be shifted towards 5' by %d bp(s)" % (options.shift * -1))
        options.scanwindow = 2*options.d # remove the effect of --bw
    else:
        try:
            peakmodel = PeakModel(treatment = treat,
                                  max_pairnum = MAX_PAIRNUM,
                                  opt = options
                                  )
            info("#2 finished!")
            debug("#2 Summary Model:")
            debug("#2 min_tags: %d" % (peakmodel.min_tags))
            debug("#2 d: %d" % (peakmodel.d))
            debug("#2 scan_window: %d" % (peakmodel.scan_window))
            info("#2 predicted fragment length is %d bps" % peakmodel.d)
            info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel.alternative_d)))
            info("#2.2 Generate R script for model : %s" % (options.modelR))
            model2r_script(peakmodel,options.modelR,options.name)
            options.d = peakmodel.d
            options.scanwindow = 2*options.d
            if options.d <= 2*options.tsize:
                warn("#2 Since the d (%.0f) calculated from paired-peaks are smaller than 2*tag length, it may be influenced by unknown sequencing problem!" % (options.d))
                if options.onauto:
                    # --fix-bimodal: fall back to user-supplied extsize
                    options.d = options.extsize
                    options.scanwindow = 2*options.d
                    warn("#2 MACS will use %d as EXTSIZE/fragment length d. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options.d))
                else:
                    warn("#2 You may need to consider one of the other alternative d(s): %s" % ','.join(map(str,peakmodel.alternative_d)))
                    warn("#2 You can restart the process with --nomodel --extsize XXX with your choice or an arbitrary number. Nontheless, MACS will continute computing.")
        except NotEnoughPairsException:
            # model building failed; only --fix-bimodal lets us continue
            if not options.onauto:
                sys.exit(1)
            warn("#2 Skipped...")
            options.d = options.extsize
            options.scanwindow = 2*options.d
            warn("#2 Since --fix-bimodal is set, MACS will use %d as fragment length" % (options.d))

    #3 Call Peaks
    info("#3 Call peaks...")
    if options.nolambda:
        info("# local lambda is disabled!")

    # decide options.tocontrol according to options.tolarge
    if control and options.PE_MODE:
        # each PE control fragment contributes two ends for lambda purposes
        c1 = c1 * 2

    if control:
        if options.downsample:
            # use random sampling to balance treatment and control
            info("#3 User prefers to use random sampling instead of linear scaling.")
            if t1 > c1:
                info("#3 MACS is random sampling treatment %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                treat.sample_num(c1, options.seed)
                info("#3 %d Tags from treatment are kept", treat.total)
            elif c1 > t1:
                info("#3 MACS is random sampling control %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                control.sample_num(t1, options.seed)
                info("#3 %d %ss from control are kept", control.total, tag)
            # set options.tocontrol although it wouldn't matter now
            options.tocontrol = False
        else:
            if options.scaleto == "large":
                # scale the smaller dataset up to the larger one
                options.tocontrol = (t1 <= c1)
            else:
                # scale the larger dataset down to the smaller one
                options.tocontrol = (t1 > c1)

    peakdetect = PeakDetect(treat = treat,
                            control = control,
                            opt = options
                            )
    peakdetect.call_peaks()

    # filter out low fold-enrichment peaks
    peakdetect.peaks.filter_fc( fc_low = options.fecutoff )

    #4 output
    #4.1 peaks in XLS
    info("#4 Write output xls file... %s" % (options.peakxls))
    ofhd_xls = open( options.peakxls, "w" )
    ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION))
    ofhd_xls.write(options.argtxt+"\n")
    ofhd_xls.write(tagsinfo)
    if options.shift > 0:
        ofhd_xls.write("# Sequencing ends will be shifted towards 3' by %d bp(s)\n" % (options.shift))
    elif options.shift < 0:
        ofhd_xls.write("# Sequencing ends will be shifted towards 5' by %d bp(s)\n" % (options.shift * -1))
    ofhd_xls.write("# d = %d\n" % (options.d))
    try:
        ofhd_xls.write("# alternative fragment length(s) may be %s bps\n" % ','.join(map(str,peakmodel.alternative_d)))
    except NameError:
        # when --nomodel is used, there is no peakmodel object. Simply skip this line.
        pass
    if options.nolambda:
        ofhd_xls.write("# local lambda is disabled!\n")
    # pass write method so we can print too, and include name
    peakdetect.peaks.write_to_xls(ofhd_xls, name = options.name.encode())
    ofhd_xls.close()

    #4.2 choose score column for BED-family outputs; opt_validate guarantees
    # exactly one of log_pvalue/log_qvalue is set
    if options.log_pvalue is not None:
        score_column = "pscore"
    elif options.log_qvalue is not None:
        score_column = "qscore"

    if not options.broad:
        #4.2 peaks in narrowPeak
        info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak))
        ofhd_bed = open( options.peakNarrowPeak, "w" )
        peakdetect.peaks.write_to_narrowPeak (ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), score_column=score_column, trackline=options.trackline )
        ofhd_bed.close()
        #4.2-2 summits in BED
        info("#4 Write summits bed file... %s" % (options.summitbed))
        ofhd_summits = open( options.summitbed, "w" )
        peakdetect.peaks.write_to_summit_bed (ofhd_summits, name_prefix="%s_peak_".encode(), name=options.name.encode(),
                                              description=("Summits for %s (Made with MACS v2, " + strftime("%x") + ")").encode(),
                                              score_column=score_column, trackline=options.trackline )
        ofhd_summits.close()
    else:
        #4.2 broad peaks in bed12 or gappedPeak
        info("#4 Write broad peak in broadPeak format file... %s" % (options.peakBroadPeak))
        ofhd_bed = open( options.peakBroadPeak, "w" )
        peakdetect.peaks.write_to_broadPeak (ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), description=options.name.encode(), score_column=score_column, trackline=options.trackline)
        ofhd_bed.close()
        info("#4 Write broad peak in bed12/gappedPeak format file... %s" % (options.peakGappedPeak))
        ofhd_bed = open( options.peakGappedPeak, "w" )
        peakdetect.peaks.write_to_gappedPeak (ofhd_bed, name_prefix=b"%s_peak_", name=options.name.encode(), description=options.name.encode(), score_column=score_column, trackline=options.trackline)
        ofhd_bed.close()

    info("Done!")
def cal_max_dup_tags ( genome_size, tags_number, p=1e-5 ):
    """Return the maximum allowed duplicate tag count at one position.

    The cutoff is the inverse binomial CDF at confidence level 1-p,
    given the total tag number and a per-position probability of
    1/genome_size. The underlying brute-force reverse-CDF search is
    capped at MAX_LAMBDA (100000).
    """
    per_site_prob = 1.0 / genome_size
    confidence = 1 - p
    return binomial_cdf_inv(confidence, tags_number, per_site_prob)
def load_frag_files_options ( options ):
    """Load treatment fragments (and control fragments, if given) as PE tracks.

    Returns a (treat, control) tuple; control is None when no control
    file was supplied. options.tsize is set to the mean fragment size
    reported by the parser of the last treatment file.
    """
    options.info("#1 read treatment fragments...")

    frag_parser = options.parser(options.tfile[0], buffer_size=options.buffer_size)
    treat = frag_parser.build_petrack()
    # append any additional treatment files onto the same track
    for extra_tfile in options.tfile[1:]:
        frag_parser = options.parser(extra_tfile, buffer_size=options.buffer_size)
        treat = frag_parser.append_petrack( treat )
    treat.finalize()

    # mean fragment size comes from the last treatment parser
    options.tsize = frag_parser.d

    if options.cfile:
        options.info("#1.2 read input fragments...")
        ctrl_parser = options.parser(options.cfile[0], buffer_size=options.buffer_size)
        control = ctrl_parser.build_petrack()
        # remember the first control file's mean fragment size for reporting
        control_d = ctrl_parser.d
        for extra_cfile in options.cfile[1:]:
            ctrl_parser = options.parser(extra_cfile, buffer_size=options.buffer_size)
            control = ctrl_parser.append_petrack( control )
        control.finalize()
    else:
        control = None

    options.info("#1 mean fragment size is determined as %.1f bp from treatment" % options.tsize)
    if control is not None:
        options.info("#1 note: mean fragment size in control is %.1f bp -- value ignored" % control_d)
    return (treat, control)
def load_tag_files_options ( options ):
    """Load treatment tags (and control tags, if given) as fixed-width tracks.

    Returns a (treat, control) tuple; control is None when no control
    file was supplied. When the user did not pass --tsize, the tag size
    is auto-detected from the first treatment file.
    """
    options.info("#1 read treatment tags...")

    tag_parser = options.parser(options.tfile[0], buffer_size=options.buffer_size)
    if not options.tsize:
        # no user-specified --tsize: detect it from the first treatment file
        detected_tsize = tag_parser.tsize()
        options.tsize = detected_tsize

    treat = tag_parser.build_fwtrack()
    # append any additional treatment files onto the same track
    for extra_tfile in options.tfile[1:]:
        tag_parser = options.parser(extra_tfile, buffer_size=options.buffer_size)
        treat = tag_parser.append_fwtrack( treat )
    treat.finalize()

    if options.cfile:
        options.info("#1.2 read input tags...")
        control = options.parser(options.cfile[0], buffer_size=options.buffer_size).build_fwtrack()
        for extra_cfile in options.cfile[1:]:
            ctrl_parser = options.parser(extra_cfile, buffer_size=options.buffer_size)
            control = ctrl_parser.append_fwtrack( control )
        control.finalize()
    else:
        control = None

    options.info("#1 tag size is determined as %d bps" % options.tsize)
    return (treat, control)