# cython: language_level=3
# Time-stamp: <2019-10-30 16:33:34 taoliu>

"""Module Description: Detect peaks, main module

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""
from itertools import groupby
from operator import itemgetter
import gc                               # use garbage collector
from MACS2.IO.PeakIO import PeakIO
from MACS2.IO.BedGraphIO import bedGraphIO
from MACS2.Constants import *
from MACS2.IO.CallPeakUnit import CallerFromAlignments
cdef bytes subpeak_letters( short i ):
    """Return a letter code for the i-th subpeak: 0 -> b'a', 1 -> b'b', ..., 25 -> b'z', then two letters for larger i."""
    if i < 26:
        return chr(97+i).encode()
    else:
        return subpeak_letters( i // 26 ) + chr( 97 + ( i % 26 ) ).encode()
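
# Example values for reference (a sketch, not part of the original module):
#   subpeak_letters(0)  == b'a'
#   subpeak_letters(25) == b'z'
#   subpeak_letters(26) == b'ba'    # two letters once i reaches 26
# Being a cdef function, it is only callable from Cython code; the suffix is
# presumably appended to a peak name when several subpeak summits are reported.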
29 """Class to do the peak calling.
32 >>> from MACS2.cPeakDetect import cPeakDetect
33 >>> pd = PeakDetect(treat=treatdata, control=controldata, pvalue=pvalue_cutoff, d=100, gsize=3000000000)
    def __init__ (self, opt = None, treat = None, control = None, d = None,
                  maxgap = None, minlen = None, slocal = None, llocal = None):
38 """Initialize the PeakDetect object.
43 self.debug
= opt
.debug
47 self.control
= control
48 self.ratio_treat2control
= None
50 self.final_peaks
= None
51 self.PE_MODE
= opt
.PE_MODE
52 self.scoretrack
= None

        #self.femax = opt.femax
        #self.femin = opt.femin
        #self.festep = opt.festep

        self.log_pvalue = opt.log_pvalue    # -log10pvalue
        self.log_qvalue = opt.log_qvalue    # -log10qvalue
        if d != None:
            self.d = d
        else:
            self.d = self.opt.d

        if opt.maxgap:
            self.maxgap = opt.maxgap
        else:
            self.maxgap = opt.tsize

        if opt.minlen:
            self.minlen = opt.minlen
        else:
            self.minlen = self.d

        self.end_shift = self.opt.shift
        self.gsize = opt.gsize

        self.nolambda = opt.nolambda

        if slocal != None:
            self.sregion = slocal
        else:
            self.sregion = opt.smalllocal

        if llocal != None:
            self.lregion = llocal
        else:
            self.lregion = opt.largelocal

        if self.nolambda:
            self.info("#3 !!!! DYNAMIC LAMBDA IS DISABLED !!!!")

        #self.save_score = opt.store_score
        #self.zwig_tr = opt.zwig_tr
        #self.zwig_ctl= opt.zwig_ctl

    def call_peaks (self):
        """Call peaks function.

        Scan the whole genome for peaks. RESULTS WILL BE SAVED IN
        self.final_peaks and self.final_negative_peaks.
        """
        if self.control:                # w/ control
            # (self.peaks,self.broadpeaks) = self.__call_peaks_w_control()
            self.peaks = self.__call_peaks_w_control ()
        else:                           # w/o control
            # (self.peaks,self.broadpeaks) = self.__call_peaks_wo_control()
            self.peaks = self.__call_peaks_wo_control ()
        return self.peaks

    def __call_peaks_w_control (self):
        """To call peaks with control data.

        A peak info type is a: dictionary

        key value: chromosome

        items: (peak start, peak end, peak length, peak summit, peak
        height, number of tags in peak region, peak pvalue, peak
        fold_enrichment) <-- tuple type

        While calculating pvalue:

        First, t and c will be adjusted by the ratio between total
        reads in treatment and total reads in control, depending on
        whether reads are scaled to treatment or to control (self.opt.tocontrol).

        Then, t and c will be multiplied by the smallest peak size --
        self.d.

        Finally, a poisson CDF is applied to calculate one-side pvalue
        for enrichment.
        """
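
        # A minimal sketch of the scoring described above (the actual
        # implementation lives in MACS2.IO.CallPeakUnit.CallerFromAlignments;
        # names below are illustrative assumptions, not the real code path):
        #
        #   from scipy.stats import poisson
        #   from math import log
        #   # t: scaled treatment pileup at a position
        #   # local_lambda: max of lambda_bg and the scaled local control pileups
        #   pscore = -poisson.logsf(t - 1, local_lambda) / log(10)  # -log10 one-sided pvalue
        #
        # Positions whose pscore passes the -log10 cutoff are then merged into
        # candidate peaks subject to the min_length/max_gap settings below.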
        cdef float lambda_bg, effective_depth_in_million
        cdef list ctrl_scale_s, ctrl_d_s
        cdef long treat_total, control_total
        cdef long treat_sum             # approx sum of treatment pileup values
        cdef long control_sum           # approx sum of control pileup values

        treat_total = self.treat.total

        if self.PE_MODE:
            d = self.treat.average_template_length
            control_total = self.control.total * 2 # in PE mode, entire fragment is counted as 1
                                                   # in treatment whereas both ends of fragment are counted in control/input.
            treat_sum = self.treat.length
            control_sum = control_total * self.treat.average_template_length
            self.ratio_treat2control = float(treat_sum)/control_sum
        else:
            d = self.d
            control_total = self.control.total
            treat_sum = self.treat.total * self.d
            control_sum = self.control.total * self.d
            self.ratio_treat2control = float(treat_sum)/control_sum

        if self.opt.ratio != 1.0:
            self.ratio_treat2control = self.opt.ratio

        if self.opt.tocontrol:
            # if MACS decides to scale treatment to control data because treatment is bigger
            effective_depth_in_million = control_total / 1000000.0
            lambda_bg = float( control_sum )/ self.gsize
            treat_scale = 1/self.ratio_treat2control
        else:
            # if MACS decides to scale control to treatment because control sample is bigger
            effective_depth_in_million = treat_total / 1000000.0
            lambda_bg = float( treat_sum )/ self.gsize
            treat_scale = 1.0

        # prepare d_s for control data
        assert self.d <= self.sregion, "slocal can't be smaller than d!"
        assert self.d <= self.lregion, "llocal can't be smaller than d!"
        assert self.sregion <= self.lregion, "llocal can't be smaller than slocal!"

        # Now prepare a list of extension sizes
        ctrl_d_s = [ self.d ]   # note, d doesn't make sense in PE mode.
        # And a list of scaling factors for control
        ctrl_scale_s = []

        if not self.opt.tocontrol:
            # if user wants to scale everything to ChIP data
            tmp_v = self.ratio_treat2control
        else:
            tmp_v = 1.0
        ctrl_scale_s.append( tmp_v )

        # slocal-size local background
        ctrl_d_s.append( self.sregion )
        if not self.opt.tocontrol:
            # if user wants to scale everything to ChIP data
            tmp_v = float(self.d)/self.sregion*self.ratio_treat2control
        else:
            tmp_v = float(self.d)/self.sregion
        ctrl_scale_s.append( tmp_v )

        # llocal-size local background
        if self.lregion and self.lregion > self.sregion:
            ctrl_d_s.append( self.lregion )
            if not self.opt.tocontrol:
                # if user wants to scale everything to ChIP data
                tmp_v = float(self.d)/self.lregion*self.ratio_treat2control
            else:
                tmp_v = float(self.d)/self.lregion
            ctrl_scale_s.append( tmp_v )

        #if self.PE_MODE: # first d/scale are useless in PE mode
        #    ctrl_d_s = ctrl_d_s[1:]
        #    ctrl_scale_s = ctrl_scale_s[1:]
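
        # For orientation (illustrative numbers, not executed): with d = 200,
        # slocal = 1000, llocal = 10000 and tocontrol disabled, the lists
        # built above would be
        #   ctrl_d_s     = [ 200, 1000, 10000 ]
        #   ctrl_scale_s = [ r, 200/1000.0*r, 200/10000.0*r ]
        # where r = self.ratio_treat2control, i.e. each control window is
        # scaled so it becomes comparable to a d-sized treatment window.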

        scorecalculator = CallerFromAlignments( self.treat, self.control,
                                                d = d, ctrl_d_s = ctrl_d_s,
                                                treat_scaling_factor = treat_scale,
                                                ctrl_scaling_factor_s = ctrl_scale_s,
                                                end_shift = self.end_shift,
                                                lambda_bg = lambda_bg,
                                                save_bedGraph = self.opt.store_bdg,
                                                bedGraph_filename_prefix = self.opt.name,
                                                bedGraph_treat_filename = self.opt.bdg_treat,
                                                bedGraph_control_filename = self.opt.bdg_control,
                                                save_SPMR = self.opt.do_SPMR,
                                                cutoff_analysis_filename = self.opt.cutoff_analysis_file )

        if self.opt.trackline: scorecalculator.enable_trackline()

        call_summits = self.opt.call_summits
        if call_summits: self.info("#3 Going to call summits inside each peak ...")

        if self.log_pvalue != None:
            if self.opt.broad:
                self.info("#3 Call broad peaks with given level1 -log10pvalue cutoff and level2: %.5f, %.5f..." % (self.log_pvalue, self.opt.log_broadcutoff) )
                peaks = scorecalculator.call_broadpeaks(['p',],
                                                        lvl1_cutoff_s=[self.log_pvalue,],
                                                        lvl2_cutoff_s=[self.opt.log_broadcutoff,],
                                                        min_length=self.minlen,
                                                        lvl1_max_gap=self.maxgap,
                                                        lvl2_max_gap=self.maxgap*4,
                                                        cutoff_analysis=self.opt.cutoff_analysis )
            else:
                self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue)
                peaks = scorecalculator.call_peaks( ['p',], [self.log_pvalue,],
                                                    min_length=self.minlen,
                                                    max_gap=self.maxgap,
                                                    call_summits=call_summits,
                                                    cutoff_analysis=self.opt.cutoff_analysis )
        elif self.log_qvalue != None:
            if self.opt.broad:
                self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." % (self.log_qvalue, self.opt.log_broadcutoff) )
                peaks = scorecalculator.call_broadpeaks(['q',],
                                                        lvl1_cutoff_s=[self.log_qvalue,],
                                                        lvl2_cutoff_s=[self.opt.log_broadcutoff,],
                                                        min_length=self.minlen,
                                                        lvl1_max_gap=self.maxgap,
                                                        lvl2_max_gap=self.maxgap*4,
                                                        cutoff_analysis=self.opt.cutoff_analysis )
            else:
                peaks = scorecalculator.call_peaks( ['q',], [self.log_qvalue,],
                                                    min_length=self.minlen,
                                                    max_gap=self.maxgap,
                                                    call_summits=call_summits,
                                                    cutoff_analysis=self.opt.cutoff_analysis )
        scorecalculator.destroy()
        return peaks

    def __call_peaks_wo_control (self):
        """To call peaks without control data.

        A peak info type is a: dictionary

        key value: chromosome

        items: (peak start, peak end, peak length, peak summit, peak
        height, number of tags in peak region, peak pvalue, peak
        fold_enrichment) <-- tuple type

        While calculating pvalue:

        First, t and c will be adjusted by the ratio between total
        reads in treatment and total reads in control, depending on
        whether reads are scaled to treatment or to control (self.opt.tocontrol).

        Then, t and c will be multiplied by the smallest peak size --
        self.d.

        Finally, a poisson CDF is applied to calculate one-side pvalue
        for enrichment.
        """
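
        # Rough orientation for the no-control case (illustrative numbers, not
        # computed here): lambda_bg is simply the expected coverage of a d-sized
        # window, e.g. with d = 200, 10 million treatment reads and gsize = 3e9,
        #   lambda_bg = 200 * 1e7 / 3e9 ~= 0.67
        # so the poisson one-sided pvalue becomes small after only a handful of
        # overlapping fragments.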
        cdef float lambda_bg, effective_depth_in_million
        cdef float treat_scale = 1
        cdef list ctrl_scale_s, ctrl_d_s

        if self.PE_MODE: d = 0
        else: d = self.d

        treat_length = self.treat.length
        treat_total = self.treat.total

        effective_depth_in_million = treat_total / 1000000.0

        if self.PE_MODE:
            # this is an estimator; we should maybe test it for accuracy?
            lambda_bg = treat_length / self.gsize
        else:
            lambda_bg = float(d) * treat_total / self.gsize

        # slocal and d-size local bias are not calculated!
        # nothing done here. should this match the with-control case?
        if not self.nolambda:
            if self.PE_MODE:
                ctrl_scale_s = [ float(treat_length) / (self.lregion*treat_total*2), ]
            else:
                ctrl_scale_s = [ float(self.d) / self.lregion, ]
            ctrl_d_s = [ self.lregion, ]
        else:
            ctrl_scale_s = []
            ctrl_d_s = []

        scorecalculator = CallerFromAlignments( self.treat, None,
                                                d = d, ctrl_d_s = ctrl_d_s,
                                                treat_scaling_factor = treat_scale,
                                                ctrl_scaling_factor_s = ctrl_scale_s,
                                                end_shift = self.end_shift,
                                                lambda_bg = lambda_bg,
                                                save_bedGraph = self.opt.store_bdg,
                                                bedGraph_filename_prefix = self.opt.name,
                                                bedGraph_treat_filename = self.opt.bdg_treat,
                                                bedGraph_control_filename = self.opt.bdg_control,
                                                save_SPMR = self.opt.do_SPMR,
                                                cutoff_analysis_filename = self.opt.cutoff_analysis_file )

        if self.opt.trackline: scorecalculator.enable_trackline()

        call_summits = self.opt.call_summits
        if call_summits: self.info("#3 Going to call summits inside each peak ...")

        if self.log_pvalue != None:
            if self.opt.broad:
                self.info("#3 Call broad peaks with given level1 -log10pvalue cutoff and level2: %.5f, %.5f..." % (self.log_pvalue, self.opt.log_broadcutoff) )
                peaks = scorecalculator.call_broadpeaks(['p',],
                                                        lvl1_cutoff_s=[self.log_pvalue,],
                                                        lvl2_cutoff_s=[self.opt.log_broadcutoff,],
                                                        min_length=self.minlen,
                                                        lvl1_max_gap=self.maxgap,
                                                        lvl2_max_gap=self.maxgap*4,
                                                        cutoff_analysis=self.opt.cutoff_analysis )
            else:
                self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue)
                peaks = scorecalculator.call_peaks( ['p',], [self.log_pvalue,],
                                                    min_length=self.minlen,
                                                    max_gap=self.maxgap,
                                                    call_summits=call_summits,
                                                    cutoff_analysis=self.opt.cutoff_analysis )
        elif self.log_qvalue != None:
            if self.opt.broad:
                self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." % (self.log_qvalue, self.opt.log_broadcutoff) )
                peaks = scorecalculator.call_broadpeaks(['q',],
                                                        lvl1_cutoff_s=[self.log_qvalue,],
                                                        lvl2_cutoff_s=[self.opt.log_broadcutoff,],
                                                        min_length=self.minlen,
                                                        lvl1_max_gap=self.maxgap,
                                                        lvl2_max_gap=self.maxgap*4,
                                                        cutoff_analysis=self.opt.cutoff_analysis )
            else:
                peaks = scorecalculator.call_peaks( ['q',], [self.log_qvalue,],
                                                    min_length=self.minlen,
                                                    max_gap=self.maxgap,
                                                    call_summits=call_summits,
                                                    cutoff_analysis=self.opt.cutoff_analysis )
        scorecalculator.destroy()
        return peaks

    # def __diag_w_control (self):
    #     for i in xrange(90,10,-10):
    #         self.info("#3 diag: sample %d%%" % i)
    #         sample_peaks[i]=self.__diag_peakfinding_w_control_sample(float(i)/(i+10))
    #     return self.__overlap (self.final_peaks, sample_peaks,top=90,bottom=10,step=-10)

    # def __diag_peakfinding_w_control_sample (self, percent):
    #     self.treat.sample(percent) # because sampling is after
    #                                # shifting, track.total is used
    #     self.control.sample(percent)
    #     ratio_treat2control = float(self.treat.total)/self.control.total

    #     #self.lambda_bg = float(self.scan_window)*self.treat.total/self.gsize # bug fixed...
    #     #self.min_tags = poisson_cdf_inv(1-pow(10,self.pvalue/-10),self.lambda_bg)+1

    #     self.debug("#3 diag: after shift and merging, treat: %d, control: %d" % (self.treat.total,self.control.total))
    #     self.info("#3 diag: call peak candidates")
    #     peak_candidates = self.__call_peaks_from_trackI (self.treat)

    #     self.info("#3 diag: call negative peak candidates")
    #     negative_peak_candidates = self.__call_peaks_from_trackI (self.control)

    #     self.info("#3 diag: use control data to filter peak candidates...")
    #     final_peaks_percent = self.__filter_w_control(peak_candidates,self.treat,self.control, ratio_treat2control)
    #     return final_peaks_percent

    # def __diag_wo_control (self):
    #     for i in xrange(90,10,-10):
    #         self.info("#3 diag: sample %d%%" % i)
    #         sample_peaks[i]=self.__diag_peakfinding_wo_control_sample(float(i)/(i+10))
    #     return self.__overlap (self.final_peaks, sample_peaks,top=90,bottom=10,step=-10)

    # def __diag_peakfinding_wo_control_sample (self, percent):
    #     #self.lambda_bg = float(self.scan_window)*self.treat.total/self.gsize # bug fixed...
    #     #self.min_tags = poisson_cdf_inv(1-pow(10,self.pvalue/-10),self.lambda_bg)+1

    #     self.treat.sample(percent)
    #     self.debug("#3 diag: after shift and merging, tags: %d" % (self.treat.total))
    #     self.info("#3 diag: call peak candidates")
    #     peak_candidates = self.__call_peaks_from_trackI (self.treat)
    #     self.info("#3 diag: use self to calculate local lambda and filter peak candidates...")
    #     final_peaks_percent = self.__filter_w_control(peak_candidates,self.treat,self.treat,pass_sregion=True) # bug fixed...
    #     return final_peaks_percent

    # def __overlap (self, gold_peaks, sample_peaks, top=90,bottom=10,step=-10):
    #     """Calculate the overlap between several fe ranges for the
    #     golden peaks set and results from sampled data.
    #     """
    #     gp.init_from_dict(gold_peaks)
    #     if self.femax:
    #         femax = min(self.femax, (int(gp.max_fold_enrichment())//self.festep+1)*self.festep)
    #     else:
    #         femax = (int(gp.max_fold_enrichment())//self.festep+1)*self.festep
    #     for f in xrange(femin, femax, self.festep):
    #         fe_low = f
    #         fe_up = f + self.festep
    #         self.debug("#3 diag: fe range = %d -- %d" % (fe_low, fe_up))
    #         r = self.__overlap_fe(gold_peaks, sample_peaks, fe_low, fe_up, top, bottom, step)
    #         diag_result.append(r)

    # def __overlap_fe (self, gold_peaks, sample_peaks, fe_low, fe_up, top, bottom, step):
    #     ret = ["%d-%d" % (fe_low,fe_up)]
    #     gp.init_from_dict(gold_peaks)
    #     gp.filter_fc(fe_low,fe_up)
    #     gptotal = gp.total()
    #     ret.append(gptotal)
    #     for i in xrange(top,bottom,step):
    #         p.init_from_dict(sample_peaks[i])
    #         percent = 100.0*gp.overlap_with_other_peaks(p)/gptotal
    #         ret.append(percent)
    #     return ret

    # def __remove_overlapping_peaks (self, peaks ):
    #     """peak_candidates[chrom] = [(peak_start,peak_end,peak_length,peak_summit,peak_height,number_cpr_tags)...]
    #     """
    #     chrs = peaks.keys()
    #     new_peaks = {}
    #     for chrom in chrs:
    #         new_peaks[chrom]=[]
    #         n_append = new_peaks[chrom].append
    #         prev_peak = None
    #         peaks_chr = peaks[chrom]
    #         for i in xrange(len(peaks_chr)):
    #             if not prev_peak:
    #                 prev_peak = peaks_chr[i]
    #                 continue
    #             if peaks_chr[i][0] <= prev_peak[1]:
    #                 s_new_peak = prev_peak[0]
    #                 e_new_peak = peaks_chr[i][1]
    #                 l_new_peak = e_new_peak-s_new_peak
    #                 if peaks_chr[i][4] > prev_peak[4]:
    #                     summit_new_peak = peaks_chr[i][3]
    #                     h_new_peak = peaks_chr[i][4]
    #                 else:
    #                     summit_new_peak = prev_peak[3]
    #                     h_new_peak = prev_peak[4]
    #                 prev_peak = (s_new_peak,e_new_peak,l_new_peak,summit_new_peak,h_new_peak,peaks_chr[i][5]+prev_peak[5])
    #             else:
    #                 n_append(prev_peak)
    #                 prev_peak = peaks_chr[i]
    #         if prev_peak:
    #             n_append(prev_peak)