Merge pull request #363 from taoliu/fix_test_bdgopt
[MACS.git] / MACS2 / PeakDetect.pyx
blobbd759b53f2892ecdfa314dbe75d32872492358e8
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2019-10-30 16:33:34 taoliu>

"""Module Description: Detect peaks, main module

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""

# ------------------------------------
# python modules
# ------------------------------------
from itertools import groupby
from operator import itemgetter
import io
import gc                       # use garbage collector

# ------------------------------------
# MACS2 modules
# ------------------------------------
from MACS2.IO.PeakIO import PeakIO
from MACS2.IO.BedGraphIO import bedGraphIO
from MACS2.Constants import *
from MACS2.IO.CallPeakUnit import CallerFromAlignments
cdef bytes subpeak_letters(short i):
    """Return the letter suffix (as bytes) used to label subpeak *i*.

    Uses a recursive base-26 scheme: 0 -> b'a', 25 -> b'z', 26 -> b'ba',
    so every non-negative index gets a unique lowercase-letter name.
    """
    if i < 26:
        return chr(97 + i).encode()
    else:
        # for i >= 26, recurse on the higher base-26 "digit" first
        return subpeak_letters(i // 26) + chr(97 + (i % 26)).encode()
class PeakDetect:
    """Class to do the peak calling.

    e.g
    >>> from MACS2.cPeakDetect import cPeakDetect
    >>> pd = PeakDetect(treat=treatdata, control=controldata, pvalue=pvalue_cutoff, d=100, gsize=3000000000)
    >>> pd.call_peaks()
    """
    def __init__ (self, opt = None, treat = None, control = None, d = None,
                  maxgap = None, minlen = None, slocal = None, llocal = None):
        """Initialize the PeakDetect object.

        opt     : options object; attributes (info, debug, warn, PE_MODE,
                  log_pvalue, log_qvalue, d, maxgap, tsize, minlen, shift,
                  gsize, nolambda, smalllocal, largelocal) are read directly.
        treat   : treatment alignment track object.
        control : control/input track object, or None for no-control mode.
        d       : fragment size; falls back to opt.d when not given.
        slocal  : small local region size; falls back to opt.smalllocal.
        llocal  : large local region size; falls back to opt.largelocal.

        NOTE(review): the *maxgap* and *minlen* constructor arguments are
        accepted but ignored below — opt.maxgap/opt.minlen are used instead.
        Preserved as-is to keep behavior identical; confirm before changing.
        """
        self.opt = opt
        # logging helpers come straight from the options object
        self.info = opt.info
        self.debug = opt.debug
        self.warn = opt.warn

        self.treat = treat
        self.control = control
        self.ratio_treat2control = None
        self.peaks = None
        self.final_peaks = None
        self.PE_MODE = opt.PE_MODE
        self.scoretrack = None

        self.log_pvalue = opt.log_pvalue    # -log10pvalue
        self.log_qvalue = opt.log_qvalue    # -log10qvalue

        # fragment size: explicit argument wins over the option value
        if d is not None:
            self.d = d
        else:
            self.d = self.opt.d

        # maximum gap for merging nearby significant regions; defaults to
        # the tag size when opt.maxgap is unset/zero
        if opt.maxgap:
            self.maxgap = opt.maxgap
        else:
            self.maxgap = opt.tsize

        # minimum peak length; defaults to the fragment size d
        if opt.minlen:
            self.minlen = opt.minlen
        else:
            self.minlen = self.d

        self.end_shift = self.opt.shift
        self.gsize = opt.gsize

        self.nolambda = opt.nolambda

        # small local region for local-lambda estimation
        if slocal is not None:
            self.sregion = slocal
        else:
            self.sregion = opt.smalllocal

        # large local region for local-lambda estimation
        if llocal is not None:
            self.lregion = llocal
        else:
            self.lregion = opt.largelocal

        if self.nolambda:
            self.info("#3 !!!! DYNAMIC LAMBDA IS DISABLED !!!!")
97 def call_peaks (self):
98 """Call peaks function.
100 Scan the whole genome for peaks. RESULTS WILL BE SAVED IN
101 self.final_peaks and self.final_negative_peaks.
103 if self.control: # w/ control
104 #if self.opt.broad:
105 # (self.peaks,self.broadpeaks) = self.__call_peaks_w_control()
106 #else:
107 self.peaks = self.__call_peaks_w_control ()
108 else: # w/o control
109 #if self.opt.broad:
110 # (self.peaks,self.broadpeaks) = self.__call_peaks_wo_control()
111 #else:
112 self.peaks = self.__call_peaks_wo_control ()
113 return self.peaks
115 def __call_peaks_w_control (self):
116 """To call peaks with control data.
118 A peak info type is a: dictionary
120 key value: chromosome
122 items: (peak start,peak end, peak length, peak summit, peak
123 height, number of tags in peak region, peak pvalue, peak
124 fold_enrichment) <-- tuple type
126 While calculating pvalue:
128 First, t and c will be adjusted by the ratio between total
129 reads in treatment and total reads in control, depending on
130 --to-small option.
132 Then, t and c will be multiplied by the smallest peak size --
133 self.d.
135 Finally, a poisson CDF is applied to calculate one-side pvalue
136 for enrichment.
138 cdef:
139 int i
140 float lambda_bg, effective_depth_in_million
141 float treat_scale, d
142 list ctrl_scale_s, ctrl_d_s
143 long treat_total, control_total
144 long treat_sum # approx sum of treatment pileup values
145 long control_sum # approx sum of control pileup values
147 treat_total = self.treat.total
149 if self.PE_MODE:
150 d = self.treat.average_template_length
151 control_total = self.control.total * 2 # in PE mode, entire fragment is counted as 1
152 # in treatment whereas both ends of fragment are counted in control/input.
153 treat_sum = self.treat.length
154 control_sum = control_total * self.treat.average_template_length
155 self.ratio_treat2control = float(treat_sum)/control_sum
156 else:
157 d = self.d
158 control_total = self.control.total
159 treat_sum = self.treat.total * self.d
160 control_sum = self.control.total * self.d
161 self.ratio_treat2control = float(treat_sum)/control_sum
163 if self.opt.ratio != 1.0:
164 self.ratio_treat2control = self.opt.ratio
166 if self.opt.tocontrol:
167 # if MACS decides to scale treatment to control data because treatment is bigger
168 effective_depth_in_million = control_total / 1000000.0
169 lambda_bg = float( control_sum )/ self.gsize
170 treat_scale = 1/self.ratio_treat2control
171 else:
172 # if MACS decides to scale control to treatment because control sample is bigger
173 effective_depth_in_million = treat_total / 1000000.0
174 lambda_bg = float( treat_sum )/ self.gsize
175 treat_scale = 1.0
177 # prepare d_s for control data
178 if self.sregion:
179 assert self.d <= self.sregion, "slocal can't be smaller than d!"
180 if self.lregion:
181 assert self.d <= self.lregion , "llocal can't be smaller than d!"
182 assert self.sregion <= self.lregion , "llocal can't be smaller than slocal!"
184 # Now prepare a list of extension sizes
185 ctrl_d_s = [ self.d ] # note, d doesn't make sense in PE mode.
186 # And a list of scaling factors for control
187 ctrl_scale_s = []
190 if not self.opt.tocontrol:
191 # if user wants to scale everything to ChIP data
192 tmp_v = self.ratio_treat2control
193 else:
194 tmp_v = 1.0
195 ctrl_scale_s.append( tmp_v )
197 # slocal size local
198 if self.sregion:
199 ctrl_d_s.append( self.sregion )
200 if not self.opt.tocontrol:
201 # if user want to scale everything to ChIP data
202 tmp_v = float(self.d)/self.sregion*self.ratio_treat2control
203 else:
204 tmp_v = float(self.d)/self.sregion
205 ctrl_scale_s.append( tmp_v )
207 # llocal size local
208 if self.lregion and self.lregion > self.sregion:
209 ctrl_d_s.append( self.lregion )
210 if not self.opt.tocontrol:
211 # if user want to scale everything to ChIP data
212 tmp_v = float(self.d)/self.lregion*self.ratio_treat2control
213 else:
214 tmp_v = float(self.d)/self.lregion
215 ctrl_scale_s.append( tmp_v )
217 #if self.PE_MODE: # first d/scale are useless in PE mode
218 # ctrl_d_s = ctrl_d_s[1:]
219 # ctrl_scale_s = ctrl_scale_s[1:]
220 # print ctrl_d_s
221 # print ctrl_scale_s
222 if self.nolambda:
223 ctrl_d_s = []
224 ctrl_scale_s = []
226 scorecalculator = CallerFromAlignments( self.treat, self.control,
227 d = d, ctrl_d_s = ctrl_d_s,
228 treat_scaling_factor = treat_scale,
229 ctrl_scaling_factor_s = ctrl_scale_s,
230 end_shift = self.end_shift,
231 lambda_bg = lambda_bg,
232 save_bedGraph = self.opt.store_bdg,
233 bedGraph_filename_prefix = self.opt.name,
234 bedGraph_treat_filename = self.opt.bdg_treat,
235 bedGraph_control_filename = self.opt.bdg_control,
236 save_SPMR = self.opt.do_SPMR,
237 cutoff_analysis_filename = self.opt.cutoff_analysis_file )
239 if self.opt.trackline: scorecalculator.enable_trackline()
241 # call peaks
242 call_summits = self.opt.call_summits
243 if call_summits: self.info("#3 Going to call summits inside each peak ...")
245 if self.log_pvalue != None:
246 if self.opt.broad:
247 self.info("#3 Call broad peaks with given level1 -log10pvalue cutoff and level2: %.5f, %.5f..." % (self.log_pvalue,self.opt.log_broadcutoff) )
248 peaks = scorecalculator.call_broadpeaks(['p',],
249 lvl1_cutoff_s=[self.log_pvalue,],
250 lvl2_cutoff_s=[self.opt.log_broadcutoff,],
251 min_length=self.minlen,
252 lvl1_max_gap=self.maxgap,
253 lvl2_max_gap=self.maxgap*4,
254 cutoff_analysis=self.opt.cutoff_analysis )
255 else:
256 self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue)
257 peaks = scorecalculator.call_peaks( ['p',], [self.log_pvalue,],
258 min_length=self.minlen,
259 max_gap=self.maxgap,
260 call_summits=call_summits,
261 cutoff_analysis=self.opt.cutoff_analysis )
262 elif self.log_qvalue != None:
263 if self.opt.broad:
264 self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." % (self.log_qvalue,self.opt.log_broadcutoff) )
265 peaks = scorecalculator.call_broadpeaks(['q',],
266 lvl1_cutoff_s=[self.log_qvalue,],
267 lvl2_cutoff_s=[self.opt.log_broadcutoff,],
268 min_length=self.minlen,
269 lvl1_max_gap=self.maxgap,
270 lvl2_max_gap=self.maxgap*4,
271 cutoff_analysis=self.opt.cutoff_analysis )
272 else:
273 peaks = scorecalculator.call_peaks( ['q',], [self.log_qvalue,],
274 min_length=self.minlen,
275 max_gap=self.maxgap,
276 call_summits=call_summits,
277 cutoff_analysis=self.opt.cutoff_analysis )
278 scorecalculator.destroy()
279 return peaks
281 def __call_peaks_wo_control (self):
282 """To call peaks without control data.
284 A peak info type is a: dictionary
286 key value: chromosome
288 items: (peak start,peak end, peak length, peak summit, peak
289 height, number of tags in peak region, peak pvalue, peak
290 fold_enrichment) <-- tuple type
292 While calculating pvalue:
294 First, t and c will be adjusted by the ratio between total
295 reads in treatment and total reads in control, depending on
296 --to-small option.
298 Then, t and c will be multiplied by the smallest peak size --
299 self.d.
301 Finally, a poisson CDF is applied to calculate one-side pvalue
302 for enrichment.
304 cdef float lambda_bg, effective_depth_in_million
305 cdef float treat_scale = 1
306 cdef float d
307 cdef list ctrl_scale_s, ctrl_d_s
309 if self.PE_MODE: d = 0
310 else: d = self.d
311 treat_length = self.treat.length
312 treat_total = self.treat.total
314 effective_depth_in_million = treat_total / 1000000.0
316 # global lambda
317 if self.PE_MODE:
318 # # this an estimator, we should maybe test it for accuracy?
319 lambda_bg = treat_length / self.gsize
320 else:
321 lambda_bg = float(d) * treat_total / self.gsize
322 treat_scale = 1.0
324 # slocal and d-size local bias are not calculated!
325 # nothing done here. should this match w control??
327 if not self.nolambda:
328 if self.PE_MODE:
329 ctrl_scale_s = [ float(treat_length) / (self.lregion*treat_total*2), ]
330 else:
331 ctrl_scale_s = [ float(self.d) / self.lregion, ]
332 ctrl_d_s = [ self.lregion, ]
333 else:
334 ctrl_scale_s = []
335 ctrl_d_s = []
337 scorecalculator = CallerFromAlignments( self.treat, None,
338 d = d, ctrl_d_s = ctrl_d_s,
339 treat_scaling_factor = treat_scale,
340 ctrl_scaling_factor_s = ctrl_scale_s,
341 end_shift = self.end_shift,
342 lambda_bg = lambda_bg,
343 save_bedGraph = self.opt.store_bdg,
344 bedGraph_filename_prefix = self.opt.name,
345 bedGraph_treat_filename = self.opt.bdg_treat,
346 bedGraph_control_filename = self.opt.bdg_control,
347 save_SPMR = self.opt.do_SPMR,
348 cutoff_analysis_filename = self.opt.cutoff_analysis_file )
350 if self.opt.trackline: scorecalculator.enable_trackline()
352 # call peaks
353 call_summits = self.opt.call_summits
354 if call_summits: self.info("#3 Going to call summits inside each peak ...")
356 if self.log_pvalue != None:
357 if self.opt.broad:
358 self.info("#3 Call broad peaks with given level1 -log10pvalue cutoff and level2: %.5f, %.5f..." % (self.log_pvalue,self.opt.log_broadcutoff) )
359 peaks = scorecalculator.call_broadpeaks(['p',],
360 lvl1_cutoff_s=[self.log_pvalue,],
361 lvl2_cutoff_s=[self.opt.log_broadcutoff,],
362 min_length=self.minlen,
363 lvl1_max_gap=self.maxgap,
364 lvl2_max_gap=self.maxgap*4,
365 cutoff_analysis=self.opt.cutoff_analysis )
366 else:
367 self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue)
368 peaks = scorecalculator.call_peaks( ['p',], [self.log_pvalue,],
369 min_length=self.minlen,
370 max_gap=self.maxgap,
371 call_summits=call_summits,
372 cutoff_analysis=self.opt.cutoff_analysis )
373 elif self.log_qvalue != None:
374 if self.opt.broad:
375 self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." % (self.log_qvalue,self.opt.log_broadcutoff) )
376 peaks = scorecalculator.call_broadpeaks(['q',],
377 lvl1_cutoff_s=[self.log_qvalue,],
378 lvl2_cutoff_s=[self.opt.log_broadcutoff,],
379 min_length=self.minlen,
380 lvl1_max_gap=self.maxgap,
381 lvl2_max_gap=self.maxgap*4,
382 cutoff_analysis=self.opt.cutoff_analysis )
383 else:
384 peaks = scorecalculator.call_peaks( ['q',], [self.log_qvalue,],
385 min_length=self.minlen,
386 max_gap=self.maxgap,
387 call_summits=call_summits,
388 cutoff_analysis=self.opt.cutoff_analysis )
389 scorecalculator.destroy()
390 return peaks
392 # def __diag_w_control (self):
393 # # sample
394 # sample_peaks = {}
395 # for i in xrange(90,10,-10):
396 # self.info("#3 diag: sample %d%%" % i)
397 # sample_peaks[i]=self.__diag_peakfinding_w_control_sample(float(i)/(i+10))
398 # return self.__overlap (self.final_peaks, sample_peaks,top=90,bottom=10,step=-10)
400 # def __diag_peakfinding_w_control_sample (self, percent):
401 # self.treat.sample(percent) # because sampling is after
402 # # shifting, track.total is used
403 # # now.
404 # self.control.sample(percent)
405 # ratio_treat2control = float(self.treat.total)/self.control.total
407 # #self.lambda_bg = float(self.scan_window)*self.treat.total/self.gsize # bug fixed...
408 # #self.min_tags = poisson_cdf_inv(1-pow(10,self.pvalue/-10),self.lambda_bg)+1
410 # self.debug("#3 diag: after shift and merging, treat: %d, control: %d" % (self.treat.total,self.control.total))
411 # self.info("#3 diag: call peak candidates")
412 # peak_candidates = self.__call_peaks_from_trackI (self.treat)
414 # self.info("#3 diag: call negative peak candidates")
415 # negative_peak_candidates = self.__call_peaks_from_trackI (self.control)
417 # self.info("#3 diag: use control data to filter peak candidates...")
418 # final_peaks_percent = self.__filter_w_control(peak_candidates,self.treat,self.control, ratio_treat2control)
419 # return final_peaks_percent
421 # def __diag_wo_control (self):
422 # # sample
423 # sample_peaks = {}
424 # for i in xrange(90,10,-10):
425 # self.info("#3 diag: sample %d%%" % i)
426 # sample_peaks[i]=self.__diag_peakfinding_wo_control_sample(float(i)/(i+10))
427 # return self.__overlap (self.final_peaks, sample_peaks,top=90,bottom=10,step=-10)
429 # def __diag_peakfinding_wo_control_sample (self, percent):
431 # #self.lambda_bg = float(self.scan_window)*self.treat.total/self.gsize # bug fixed...
432 # #self.min_tags = poisson_cdf_inv(1-pow(10,self.pvalue/-10),self.lambda_bg)+1
434 # self.treat.sample(percent)
435 # self.debug("#3 diag: after shift and merging, tags: %d" % (self.treat.total))
436 # self.info("#3 diag: call peak candidates")
437 # peak_candidates = self.__call_peaks_from_trackI (self.treat)
438 # self.info("#3 diag: use self to calculate local lambda and filter peak candidates...")
439 # final_peaks_percent = self.__filter_w_control(peak_candidates,self.treat,self.treat,pass_sregion=True) # bug fixed...
440 # return final_peaks_percent
442 # def __overlap (self, gold_peaks, sample_peaks, top=90,bottom=10,step=-10):
443 # """Calculate the overlap between several fe range for the
444 # golden peaks set and results from sampled data.
446 # """
447 # gp = PeakIO()
448 # gp.init_from_dict(gold_peaks)
449 # if self.femax:
450 # femax = min(self.femax, (int(gp.max_fold_enrichment())//self.festep+1)*self.festep)
451 # else:
452 # femax = (int(gp.max_fold_enrichment())//self.festep+1)*self.festep
453 # femin = self.femin
454 # diag_result = []
455 # for f in xrange(femin, femax, self.festep):
457 # fe_low = f
458 # fe_up = f + self.festep
459 # self.debug("#3 diag: fe range = %d -- %d" % (fe_low, fe_up))
461 # r = self.__overlap_fe(gold_peaks, sample_peaks, fe_low, fe_up, top, bottom, step)
462 # if r:
463 # diag_result.append(r)
464 # return diag_result
466 # def __overlap_fe (self, gold_peaks, sample_peaks, fe_low, fe_up, top, bottom, step):
467 # ret = ["%d-%d" % (fe_low,fe_up)]
468 # gp = PeakIO()
469 # gp.init_from_dict(gold_peaks)
470 # gp.filter_fc(fe_low,fe_up)
471 # gptotal = gp.total()
472 # if gptotal <= 0:
473 # return None
475 # ret.append(gptotal)
476 # for i in xrange(top,bottom,step):
477 # p = PeakIO()
478 # p.init_from_dict(sample_peaks[i])
479 # percent = 100.0*gp.overlap_with_other_peaks(p)/gptotal
480 # ret.append(percent)
481 # del p
482 # return ret
485 # def __remove_overlapping_peaks (self, peaks ):
486 # """peak_candidates[chrom] = [(peak_start,peak_end,peak_length,peak_summit,peak_height,number_cpr_tags)...]
488 # """
489 # new_peaks = {}
490 # chrs = peaks.keys()
491 # chrs.sort()
492 # for chrom in chrs:
493 # new_peaks[chrom]=[]
494 # n_append = new_peaks[chrom].append
495 # prev_peak = None
496 # peaks_chr = peaks[chrom]
497 # for i in xrange(len(peaks_chr)):
498 # if not prev_peak:
499 # prev_peak = peaks_chr[i]
500 # continue
501 # else:
502 # if peaks_chr[i][0] <= prev_peak[1]:
503 # s_new_peak = prev_peak[0]
504 # e_new_peak = peaks_chr[i][1]
505 # l_new_peak = e_new_peak-s_new_peak
506 # if peaks_chr[i][4] > prev_peak[4]:
507 # summit_new_peak = peaks_chr[i][3]
508 # h_new_peak = peaks_chr[i][4]
509 # else:
510 # summit_new_peak = prev_peak[3]
511 # h_new_peak = prev_peak[4]
512 # prev_peak = (s_new_peak,e_new_peak,l_new_peak,summit_new_peak,h_new_peak,peaks_chr[i][5]+prev_peak[5])
513 # else:
514 # n_append(prev_peak)
515 # prev_peak = peaks_chr[i]
516 # if prev_peak:
517 # n_append(prev_peak)
518 # return new_peaks