Merge pull request #363 from taoliu/fix_test_bdgopt
[MACS.git] / MACS2 / PeakDetect.pyx
blobbd759b53f2892ecdfa314dbe75d32872492358e8
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2019-10-30 16:33:34 taoliu>

"""Module Description: Detect peaks, main module

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""

# ------------------------------------
# python modules
# ------------------------------------
from itertools import groupby
from operator import itemgetter
import io
import gc                       # use garbage collector

# ------------------------------------
# MACS2 modules
# ------------------------------------
from MACS2.IO.PeakIO import PeakIO
from MACS2.IO.BedGraphIO import bedGraphIO
from MACS2.Constants import *
from MACS2.IO.CallPeakUnit import CallerFromAlignments
cdef bytes subpeak_letters(short i):
    """Return the letter suffix (as bytes) used to label subpeak *i*.

    Uses a recursive base-26 scheme: 0 -> b'a', 25 -> b'z', 26 -> b'ba',
    so every non-negative index gets a unique lowercase-letter name.
    """
    if i < 26:
        return chr(97 + i).encode()
    else:
        # for i >= 26, recurse on the higher base-26 "digit" first
        return subpeak_letters(i // 26) + chr(97 + (i % 26)).encode()
class PeakDetect:
    """Class to do the peak calling.

    e.g
    >>> from MACS2.cPeakDetect import cPeakDetect
    >>> pd = PeakDetect(treat=treatdata, control=controldata, pvalue=pvalue_cutoff, d=100, gsize=3000000000)
    >>> pd.call_peaks()
    """
    def __init__ (self, opt = None, treat = None, control = None, d = None,
                  maxgap = None, minlen = None, slocal = None, llocal = None):
        """Initialize the PeakDetect object.

        opt     : options object; attributes (info, debug, warn, PE_MODE,
                  log_pvalue, log_qvalue, d, maxgap, tsize, minlen, shift,
                  gsize, nolambda, smalllocal, largelocal) are read directly.
        treat   : treatment alignment track object.
        control : control/input track object, or None for no-control mode.
        d       : fragment size; falls back to opt.d when not given.
        slocal  : small local region size; falls back to opt.smalllocal.
        llocal  : large local region size; falls back to opt.largelocal.

        NOTE(review): the *maxgap* and *minlen* constructor arguments are
        accepted but ignored below — opt.maxgap/opt.minlen are used instead.
        Preserved as-is to keep behavior identical; confirm before changing.
        """
        self.opt = opt
        # logging helpers come straight from the options object
        self.info = opt.info
        self.debug = opt.debug
        self.warn = opt.warn

        self.treat = treat
        self.control = control
        self.ratio_treat2control = None
        self.peaks = None
        self.final_peaks = None
        self.PE_MODE = opt.PE_MODE
        self.scoretrack = None

        self.log_pvalue = opt.log_pvalue    # -log10pvalue
        self.log_qvalue = opt.log_qvalue    # -log10qvalue

        # fragment size: explicit argument wins over the option value
        if d is not None:
            self.d = d
        else:
            self.d = self.opt.d

        # maximum gap for merging nearby significant regions; defaults to
        # the tag size when opt.maxgap is unset/zero
        if opt.maxgap:
            self.maxgap = opt.maxgap
        else:
            self.maxgap = opt.tsize

        # minimum peak length; defaults to the fragment size d
        if opt.minlen:
            self.minlen = opt.minlen
        else:
            self.minlen = self.d

        self.end_shift = self.opt.shift
        self.gsize = opt.gsize

        self.nolambda = opt.nolambda

        # small local region for local-lambda estimation
        if slocal is not None:
            self.sregion = slocal
        else:
            self.sregion = opt.smalllocal

        # large local region for local-lambda estimation
        if llocal is not None:
            self.lregion = llocal
        else:
            self.lregion = opt.largelocal

        if self.nolambda:
            self.info("#3 !!!! DYNAMIC LAMBDA IS DISABLED !!!!")
97 def call_peaks (self):
98 """Call peaks function.
100 Scan the whole genome for peaks. RESULTS WILL BE SAVED IN
101 self.final_peaks and self.final_negative_peaks.
103 if self.control: # w/ control
104 #if self.opt.broad:
105 # (self.peaks,self.broadpeaks) = self.__call_peaks_w_control()
106 #else:
107 self.peaks = self.__call_peaks_w_control ()
108 else: # w/o control
109 #if self.opt.broad:
110 # (self.peaks,self.broadpeaks) = self.__call_peaks_wo_control()
111 #else:
112 self.peaks = self.__call_peaks_wo_control ()
113 return self.peaks
115 def __call_peaks_w_control (self):
116 """To call peaks with control data.
118 A peak info type is a: dictionary
120 key value: chromosome
122 items: (peak start,peak end, peak length, peak summit, peak
123 height, number of tags in peak region, peak pvalue, peak
124 fold_enrichment) <-- tuple type
126 While calculating pvalue:
128 First, t and c will be adjusted by the ratio between total
129 reads in treatment and total reads in control, depending on
130 --to-small option.
132 Then, t and c will be multiplied by the smallest peak size --
133 self.d.
135 Finally, a poisson CDF is applied to calculate one-side pvalue
136 for enrichment.
138 cdef:
139 int i
140 float lambda_bg, effective_depth_in_million
141 float treat_scale, d
142 list ctrl_scale_s, ctrl_d_s
143 long treat_total, control_total
144 long treat_sum # approx sum of treatment pileup values
145 long control_sum # approx sum of control pileup values
147 treat_total = self.treat.total
149 if self.PE_MODE:
150 d = self.treat.average_template_length
151 control_total = self.control.total * 2 # in PE mode, entire fragment is counted as 1
152 # in treatment whereas both ends of fragment are counted in control/input.
153 treat_sum = self.treat.length
154 control_sum = control_total * self.treat.average_template_length
155 self.ratio_treat2control = float(treat_sum)/control_sum
156 else:
157 d = self.d
158 control_total = self.control.total
159 treat_sum = self.treat.total * self.d
160 control_sum = self.control.total * self.d
161 self.ratio_treat2control = float(treat_sum)/control_sum
163 if self.opt.ratio != 1.0:
164 self.ratio_treat2control = self.opt.ratio
166 if self.opt.tocontrol:
167 # if MACS decides to scale treatment to control data because treatment is bigger
168 effective_depth_in_million = control_total / 1000000.0
169 lambda_bg = float( control_sum )/ self.gsize
170 treat_scale = 1/self.ratio_treat2control
171 else:
172 # if MACS decides to scale control to treatment because control sample is bigger
173 effective_depth_in_million = treat_total / 1000000.0
174 lambda_bg = float( treat_sum )/ self.gsize
175 treat_scale = 1.0
177 # prepare d_s for control data
178 if self.sregion:
179 assert self.d <= self.sregion, "slocal can't be smaller than d!"
180 if self.lregion:
181 assert self.d <= self.lregion , "llocal can't be smaller than d!"
182 assert self.sregion <= self.lregion , "llocal can't be smaller than slocal!"
184 # Now prepare a list of extension sizes
185 ctrl_d_s = [ self.d ] # note, d doesn't make sense in PE mode.
186 # And a list of scaling factors for control
187 ctrl_scale_s = []
190 if not self.opt.tocontrol:
191 # if user wants to scale everything to ChIP data
192 tmp_v = self.ratio_treat2control
193 else:
194 tmp_v = 1.0
195 ctrl_scale_s.append( tmp_v )
197 # slocal size local
198 if self.sregion:
199 ctrl_d_s.append( self.sregion )
200 if not self.opt.tocontrol:
201 # if user want to scale everything to ChIP data
202 tmp_v = float(self.d)/self.sregion*self.ratio_treat2control
203 else:
204 tmp_v = float(self.d)/self.sregion
205 ctrl_scale_s.append( tmp_v )
207 # llocal size local
208 if self.lregion and self.lregion > self.sregion:
209 ctrl_d_s.append( self.lregion )
210 if not self.opt.tocontrol:
211 # if user want to scale everything to ChIP data
212 tmp_v = float(self.d)/self.lregion*self.ratio_treat2control
213 else:
214 tmp_v = float(self.d)/self.lregion
215 ctrl_scale_s.append( tmp_v )
217 #if self.PE_MODE: # first d/scale are useless in PE mode
218 # ctrl_d_s = ctrl_d_s[1:]
219 # ctrl_scale_s = ctrl_scale_s[1:]
220 # print ctrl_d_s
221 # print ctrl_scale_s
222 if self.nolambda:
223 ctrl_d_s = []
224 ctrl_scale_s = []
226 scorecalculator = CallerFromAlignments( self.treat, self.control,
227 d = d, ctrl_d_s = ctrl_d_s,
228 treat_scaling_factor = treat_scale,
229 ctrl_scaling_factor_s = ctrl_scale_s,
230 end_shift = self.end_shift,
231 lambda_bg = lambda_bg,
232 save_bedGraph = self.opt.store_bdg,
233 bedGraph_filename_prefix = self.opt.name,
234 bedGraph_treat_filename = self.opt.bdg_treat,
235 bedGraph_control_filename = self.opt.bdg_control,
236 save_SPMR = self.opt.do_SPMR,
237 cutoff_analysis_filename = self.opt.cutoff_analysis_file )
239 if self.opt.trackline: scorecalculator.enable_trackline()
241 # call peaks
242 call_summits = self.opt.call_summits
243 if call_summits: self.info("#3 Going to call summits inside each peak ...")
245 if self.log_pvalue != None:
246 if self.opt.broad:
247 self.info("#3 Call broad peaks with given level1 -log10pvalue cutoff and level2: %.5f, %.5f..." % (self.log_pvalue,self.opt.log_broadcutoff) )
248 peaks = scorecalculator.call_broadpeaks(['p',],
249 lvl1_cutoff_s=[self.log_pvalue,],
250 lvl2_cutoff_s=[self.opt.log_broadcutoff,],
251 min_length=self.minlen,
252 lvl1_max_gap=self.maxgap,
253 lvl2_max_gap=self.maxgap*4,
254 cutoff_analysis=self.opt.cutoff_analysis )
255 else:
256 self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue)
257 peaks = scorecalculator.call_peaks( ['p',], [self.log_pvalue,],
258 min_length=self.minlen,
259 max_gap=self.maxgap,
260 call_summits=call_summits,
261 cutoff_analysis=self.opt.cutoff_analysis )
262 elif self.log_qvalue != None:
263 if self.opt.broad:
264 self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." % (self.log_qvalue,self.opt.log_broadcutoff) )
265 peaks = scorecalculator.call_broadpeaks(['q',],
266 lvl1_cutoff_s=[self.log_qvalue,],
267 lvl2_cutoff_s=[self.opt.log_broadcutoff,],
268 min_length=self.minlen,
269 lvl1_max_gap=self.maxgap,
270 lvl2_max_gap=self.maxgap*4,
271 cutoff_analysis=self.opt.cutoff_analysis )
272 else:
273 peaks = scorecalculator.call_peaks( ['q',], [self.log_qvalue,],
274 min_length=self.minlen,
275 max_gap=self.maxgap,
276 call_summits=call_summits,
277 cutoff_analysis=self.opt.cutoff_analysis )
278 scorecalculator.destroy()
279 return peaks
281 def __call_peaks_wo_control (self):
282 """To call peaks without control data.
284 A peak info type is a: dictionary
286 key value: chromosome
288 items: (peak start,peak end, peak length, peak summit, peak
289 height, number of tags in peak region, peak pvalue, peak
290 fold_enrichment) <-- tuple type
292 While calculating pvalue:
294 First, t and c will be adjusted by the ratio between total
295 reads in treatment and total reads in control, depending on
296 --to-small option.
298 Then, t and c will be multiplied by the smallest peak size --
299 self.d.
301 Finally, a poisson CDF is applied to calculate one-side pvalue
302 for enrichment.
304 cdef float lambda_bg, effective_depth_in_million
305 cdef float treat_scale = 1
306 cdef float d
307 cdef list ctrl_scale_s, ctrl_d_s
309 if self.PE_MODE: d = 0
310 else: d = self.d
311 treat_length = self.treat.length
312 treat_total = self.treat.total
314 effective_depth_in_million = treat_total / 1000000.0
316 # global lambda
317 if self.PE_MODE:
318 # # this an estimator, we should maybe test it for accuracy?
319 lambda_bg = treat_length / self.gsize
320 else:
321 lambda_bg = float(d) * treat_total / self.gsize
322 treat_scale = 1.0
324 # slocal and d-size local bias are not calculated!
325 # nothing done here. should this match w control??
327 if not self.nolambda:
328 if self.PE_MODE:
329 ctrl_scale_s = [ float(treat_length) / (self.lregion*treat_total*2), ]
330 else:
331 ctrl_scale_s = [ float(self.d) / self.lregion, ]
332 ctrl_d_s = [ self.lregion, ]
333 else:
334 ctrl_scale_s = []
335 ctrl_d_s = []
337 scorecalculator = CallerFromAlignments( self.treat, None,
338 d = d, ctrl_d_s = ctrl_d_s,
339 treat_scaling_factor = treat_scale,
340 ctrl_scaling_factor_s = ctrl_scale_s,
341 end_shift = self.end_shift,
342 lambda_bg = lambda_bg,
343 save_bedGraph = self.opt.store_bdg,
344 bedGraph_filename_prefix = self.opt.name,
345 bedGraph_treat_filename = self.opt.bdg_treat,
346 bedGraph_control_filename = self.opt.bdg_control,
347 save_SPMR = self.opt.do_SPMR,
348 cutoff_analysis_filename = self.opt.cutoff_analysis_file )
350 if self.opt.trackline: scorecalculator.enable_trackline()
352 # call peaks
353 call_summits = self.opt.call_summits
354 if call_summits: self.info("#3 Going to call summits inside each peak ...")
356 if self.log_pvalue != None:
357 if self.opt.broad:
358 self.info("#3 Call broad peaks with given level1 -log10pvalue cutoff and level2: %.5f, %.5f..." % (self.log_pvalue,self.opt.log_broadcutoff) )
359 peaks = scorecalculator.call_broadpeaks(['p',],
360 lvl1_cutoff_s=[self.log_pvalue,],
361 lvl2_cutoff_s=[self.opt.log_broadcutoff,],
362 min_length=self.minlen,
363 lvl1_max_gap=self.maxgap,
364 lvl2_max_gap=self.maxgap*4,
365 cutoff_analysis=self.opt.cutoff_analysis )
366 else:
367 self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue)
368 peaks = scorecalculator.call_peaks( ['p',], [self.log_pvalue,],
369 min_length=self.minlen,
370 max_gap=self.maxgap,
371 call_summits=call_summits,
372 cutoff_analysis=self.opt.cutoff_analysis )
373 elif self.log_qvalue != None:
374 if self.opt.broad:
375 self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." % (self.log_qvalue,self.opt.log_broadcutoff) )
376 peaks = scorecalculator.call_broadpeaks(['q',],
377 lvl1_cutoff_s=[self.log_qvalue,],
378 lvl2_cutoff_s=[self.opt.log_broadcutoff,],
379 min_length=self.minlen,
380 lvl1_max_gap=self.maxgap,
381 lvl2_max_gap=self.maxgap*4,
382 cutoff_analysis=self.opt.cutoff_analysis )
383 else:
384 peaks = scorecalculator.call_peaks( ['q',], [self.log_qvalue,],
385 min_length=self.minlen,
386 max_gap=self.maxgap,
387 call_summits=call_summits,
388 cutoff_analysis=self.opt.cutoff_analysis )
389 scorecalculator.destroy()
390 return peaks
392 # def __diag_w_control (self):
393 # # sample
394 # sample_peaks = {}
395 # for i in xrange(90,10,-10):
396 # self.info("#3 diag: sample %d%%" % i)
397 # sample_peaks[i]=self.__diag_peakfinding_w_control_sample(float(i)/(i+10))
398 # return self.__overlap (self.final_peaks, sample_peaks,top=90,bottom=10,step=-10)
400 # def __diag_peakfinding_w_control_sample (self, percent):
401 # self.treat.sample(percent) # because sampling is after
402 # # shifting, track.total is used
403 # # now.
404 # self.control.sample(percent)
405 # ratio_treat2control = float(self.treat.total)/self.control.total
407 # #self.lambda_bg = float(self.scan_window)*self.treat.total/self.gsize # bug fixed...
408 # #self.min_tags = poisson_cdf_inv(1-pow(10,self.pvalue/-10),self.lambda_bg)+1
410 # self.debug("#3 diag: after shift and merging, treat: %d, control: %d" % (self.treat.total,self.control.total))
411 # self.info("#3 diag: call peak candidates")
412 # peak_candidates = self.__call_peaks_from_trackI (self.treat)
414 # self.info("#3 diag: call negative peak candidates")
415 # negative_peak_candidates = self.__call_peaks_from_trackI (self.control)
417 # self.info("#3 diag: use control data to filter peak candidates...")
418 # final_peaks_percent = self.__filter_w_control(peak_candidates,self.treat,self.control, ratio_treat2control)
419 # return final_peaks_percent
421 # def __diag_wo_control (self):
422 # # sample
423 # sample_peaks = {}
424 # for i in xrange(90,10,-10):
425 # self.info("#3 diag: sample %d%%" % i)
426 # sample_peaks[i]=self.__diag_peakfinding_wo_control_sample(float(i)/(i+10))
427 # return self.__overlap (self.final_peaks, sample_peaks,top=90,bottom=10,step=-10)
429 # def __diag_peakfinding_wo_control_sample (self, percent):
431 # #self.lambda_bg = float(self.scan_window)*self.treat.total/self.gsize # bug fixed...
432 # #self.min_tags = poisson_cdf_inv(1-pow(10,self.pvalue/-10),self.lambda_bg)+1
434 # self.treat.sample(percent)
435 # self.debug("#3 diag: after shift and merging, tags: %d" % (self.treat.total))
436 # self.info("#3 diag: call peak candidates")
437 # peak_candidates = self.__call_peaks_from_trackI (self.treat)
438 # self.info("#3 diag: use self to calculate local lambda and filter peak candidates...")
439 # final_peaks_percent = self.__filter_w_control(peak_candidates,self.treat,self.treat,pass_sregion=True) # bug fixed...
440 # return final_peaks_percent
442 # def __overlap (self, gold_peaks, sample_peaks, top=90,bottom=10,step=-10):
443 # """Calculate the overlap between several fe range for the
444 # golden peaks set and results from sampled data.
446 # """
447 # gp = PeakIO()
448 # gp.init_from_dict(gold_peaks)
449 # if self.femax:
450 # femax = min(self.femax, (int(gp.max_fold_enrichment())//self.festep+1)*self.festep)
451 # else:
452 # femax = (int(gp.max_fold_enrichment())//self.festep+1)*self.festep
453 # femin = self.femin
454 # diag_result = []
455 # for f in xrange(femin, femax, self.festep):
457 # fe_low = f
458 # fe_up = f + self.festep
459 # self.debug("#3 diag: fe range = %d -- %d" % (fe_low, fe_up))
461 # r = self.__overlap_fe(gold_peaks, sample_peaks, fe_low, fe_up, top, bottom, step)
462 # if r:
463 # diag_result.append(r)
464 # return diag_result
466 # def __overlap_fe (self, gold_peaks, sample_peaks, fe_low, fe_up, top, bottom, step):
467 # ret = ["%d-%d" % (fe_low,fe_up)]
468 # gp = PeakIO()
469 # gp.init_from_dict(gold_peaks)
470 # gp.filter_fc(fe_low,fe_up)
471 # gptotal = gp.total()
472 # if gptotal <= 0:
473 # return None
475 # ret.append(gptotal)
476 # for i in xrange(top,bottom,step):
477 # p = PeakIO()
478 # p.init_from_dict(sample_peaks[i])
479 # percent = 100.0*gp.overlap_with_other_peaks(p)/gptotal
480 # ret.append(percent)
481 # del p
482 # return ret
485 # def __remove_overlapping_peaks (self, peaks ):
486 # """peak_candidates[chrom] = [(peak_start,peak_end,peak_length,peak_summit,peak_height,number_cpr_tags)...]
488 # """
489 # new_peaks = {}
490 # chrs = peaks.keys()
491 # chrs.sort()
492 # for chrom in chrs:
493 # new_peaks[chrom]=[]
494 # n_append = new_peaks[chrom].append
495 # prev_peak = None
496 # peaks_chr = peaks[chrom]
497 # for i in xrange(len(peaks_chr)):
498 # if not prev_peak:
499 # prev_peak = peaks_chr[i]
500 # continue
501 # else:
502 # if peaks_chr[i][0] <= prev_peak[1]:
503 # s_new_peak = prev_peak[0]
504 # e_new_peak = peaks_chr[i][1]
505 # l_new_peak = e_new_peak-s_new_peak
506 # if peaks_chr[i][4] > prev_peak[4]:
507 # summit_new_peak = peaks_chr[i][3]
508 # h_new_peak = peaks_chr[i][4]
509 # else:
510 # summit_new_peak = prev_peak[3]
511 # h_new_peak = prev_peak[4]
512 # prev_peak = (s_new_peak,e_new_peak,l_new_peak,summit_new_peak,h_new_peak,peaks_chr[i][5]+prev_peak[5])
513 # else:
514 # n_append(prev_peak)
515 # prev_peak = peaks_chr[i]
516 # if prev_peak:
517 # n_append(prev_peak)
518 # return new_peaks