Merge pull request #589 from tjni/clean-up-build-dependencies
[MACS.git] / MACS3 / Signal / CallPeakUnit.pyx
blobc6ffb7b856adb8e1155d9d5c4bec1fbe0179217f
1 # cython: language_level=3
2 # cython: profile=True
3 # cython: linetrace=True
4 # Time-stamp: <2022-09-15 17:06:17 Tao Liu>
6 """Module for Calculate Scores.
8 This code is free software; you can redistribute it and/or modify it
9 under the terms of the BSD License (see the file LICENSE included with
10 the distribution).
11 """
13 # ------------------------------------
14 # python modules
15 # ------------------------------------
17 from collections import Counter
18 from copy import copy
19 from time import time as ttime
20 import _pickle as cPickle
21 from tempfile import mkstemp
22 import os
24 import logging
25 import MACS3.Utilities.Logger
27 logger = logging.getLogger(__name__)
28 debug = logger.debug
29 info = logger.info
30 # ------------------------------------
31 # Other modules
32 # ------------------------------------
33 import numpy as np
34 cimport numpy as np
35 from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t
36 from cpython cimport bool
37 from cykhash import PyObjectMap, Float32to32Map
39 # ------------------------------------
40 # C lib
41 # ------------------------------------
42 from libc.stdio cimport *
43 from libc.math cimport exp,log,log10, M_LN10, log1p, erf, sqrt, floor, ceil
45 # ------------------------------------
46 # MACS3 modules
47 # ------------------------------------
48 from MACS3.Signal.SignalProcessing import maxima, enforce_valleys, enforce_peakyness
49 from MACS3.IO.PeakIO import PeakIO, BroadPeakIO, parse_peakname
50 from MACS3.Signal.FixWidthTrack import FWTrack
51 from MACS3.Signal.PairedEndTrack import PETrackI
52 from MACS3.Signal.Prob import poisson_cdf
53 # --------------------------------------------
54 # cached pscore function and LR_asym functions
55 # --------------------------------------------
56 pscore_dict = PyObjectMap()
57 logLR_dict = PyObjectMap()
59 cdef float32_t get_pscore ( tuple t ):
60 """t: tuple of ( lambda, observation )
61 """
62 cdef:
63 float32_t val
64 if t in pscore_dict:
65 return pscore_dict[ t ]
66 else:
67 # calculate and cache
68 val = -1.0 * poisson_cdf ( t[0], t[1], False, True )
69 pscore_dict[ t ] = val
70 return val
72 cdef float32_t get_logLR_asym ( tuple t ):
73 """Calculate log10 Likelihood between H1 ( enriched ) and H0 (
74 chromatin bias ). Set minus sign for depletion.
75 """
76 cdef:
77 float32_t val
78 float32_t x
79 float32_t y
80 if t in logLR_dict:
81 return logLR_dict[ t ]
82 else:
83 x = t[0]
84 y = t[1]
85 # calculate and cache
86 if x > y:
87 val = (x*(log10(x)-log10(y))+y-x)
88 elif x < y:
89 val = (x*(-log10(x)+log10(y))-y+x)
90 else:
91 val = 0
92 logLR_dict[ t ] = val
93 return val
95 # ------------------------------------
96 # constants
97 # ------------------------------------
98 __version__ = "CallPeakUnit $Revision$"
99 __author__ = "Tao Liu <vladimir.liu@gmail.com>"
100 __doc__ = "CallPeakUnit"
102 LOG10_E = 0.43429448190325176
104 # ------------------------------------
105 # Misc functions
106 # ------------------------------------
108 cdef void clean_up_ndarray ( np.ndarray x ):
109 # clean numpy ndarray in two steps
110 cdef:
111 int64_t i
112 i = x.shape[0] // 2
113 x.resize( 100000 if i > 100000 else i, refcheck=False)
114 x.resize( 0, refcheck=False)
115 return
117 cdef inline float32_t chi2_k1_cdf ( float32_t x ):
118 return erf( sqrt(x/2) )
120 cdef inline float32_t log10_chi2_k1_cdf ( float32_t x ):
121 return log10( erf( sqrt(x/2) ) )
123 cdef inline float32_t chi2_k2_cdf ( float32_t x ):
124 return 1 - exp( -x/2 )
126 cdef inline float32_t log10_chi2_k2_cdf ( float32_t x ):
127 return log1p( - exp( -x/2 ) ) * LOG10_E
129 cdef inline float32_t chi2_k4_cdf ( float32_t x ):
130 return 1 - exp( -x/2 ) * ( 1 + x/2 )
132 cdef inline float32_t log10_chi2_k4_CDF ( float32_t x ):
133 return log1p( - exp( -x/2 ) * ( 1 + x/2 ) ) * LOG10_E
135 cdef inline np.ndarray apply_multiple_cutoffs ( list multiple_score_arrays, list multiple_cutoffs ):
136 cdef:
137 int32_t i
138 np.ndarray ret
140 ret = multiple_score_arrays[0] > multiple_cutoffs[0]
142 for i in range(1,len(multiple_score_arrays)):
143 ret += multiple_score_arrays[i] > multiple_cutoffs[i]
145 return ret
147 cdef inline list get_from_multiple_scores ( list multiple_score_arrays, int32_t index ):
148 cdef:
149 list ret = []
150 int32_t i
152 for i in range(len(multiple_score_arrays)):
153 ret.append(multiple_score_arrays[i][index])
154 return ret
157 cdef inline float32_t get_logFE ( float32_t x, float32_t y ):
158 """ return 100* log10 fold enrichment with +1 pseudocount.
160 return log10( x/y )
162 cdef inline float32_t get_subtraction ( float32_t x, float32_t y):
163 """ return subtraction.
165 return x - y
167 cdef inline list getitem_then_subtract ( list peakset, int32_t start ):
168 cdef:
169 list a
171 a = [x["start"] for x in peakset]
172 for i in range(len(a)):
173 a[i] = a[i] - start
174 return a
176 cdef inline int32_t left_sum ( data, int32_t pos, int32_t width ):
179 return sum([data[x] for x in data if x <= pos and x >= pos - width])
181 cdef inline int32_t right_sum ( data, int32_t pos, int32_t width ):
184 return sum([data[x] for x in data if x >= pos and x <= pos + width])
186 cdef inline int32_t left_forward ( data, int32_t pos, int32_t window_size ):
187 return data.get(pos,0) - data.get(pos-window_size, 0)
189 cdef inline int32_t right_forward ( data, int32_t pos, int32_t window_size ):
190 return data.get(pos + window_size, 0) - data.get(pos, 0)
192 cdef float32_t median_from_value_length ( np.ndarray[np.float32_t, ndim=1] value, list length ):
195 cdef:
196 list tmp
197 int32_t c, tmp_l
198 float32_t tmp_v, mid_l
200 c = 0
201 tmp = sorted(list(zip( value, length )))
202 mid_l = sum( length )/2
203 for (tmp_v, tmp_l) in tmp:
204 c += tmp_l
205 if c > mid_l:
206 return tmp_v
208 cdef float32_t mean_from_value_length ( np.ndarray[np.float32_t, ndim=1] value, list length ):
209 """take list of values and list of corresponding lengths, calculate the mean.
210 An important function for bedGraph type of data.
212 cdef:
213 int32_t i
214 int32_t tmp_l, l
215 float64_t tmp_v, sum_v, tmp_sum #try to solve precision issue
216 float32_t ret
218 sum_v = 0
219 l = 0
221 for i in range( len(length) ):
222 tmp_l = length[ i ]
223 tmp_v = <float64_t>value[ i ]
224 tmp_sum = tmp_v * tmp_l
225 sum_v = tmp_sum + sum_v
226 l += tmp_l
228 ret = <float32_t>(sum_v/l)
230 return ret
233 cdef tuple find_optimal_cutoff( list x, list y ):
234 """Return the best cutoff x and y.
236 We assume that total peak length increase exponentially while
237 decreasing cutoff value. But while cutoff decreases to a point
238 that background noises are captured, total length increases much
239 faster. So we fit a linear model by taking the first 10 points,
240 then look for the largest cutoff that
243 cdef:
244 np.ndarray npx, npy, npA
245 float32_t optimal_x, optimal_y
246 int64_t l, i
247 float32_t m, c # slop and intercept
248 float32_t sst # sum of squared total
249 float32_t sse # sum of squared error
250 float32_t rsq # R-squared
252 l = len(x)
253 assert l == len(y)
254 npx = np.array( x )
255 npy = np.log10( np.array( y ) )
256 npA = np.vstack( [npx, np.ones(len(npx))] ).T
258 for i in range( 10, l ):
259 # at least the largest 10 points
260 m, c = np.linalg.lstsq( npA[:i], npy[:i], rcond=None )[ 0 ]
261 sst = sum( ( npy[:i] - np.mean( npy[:i] ) ) ** 2 )
262 sse = sum( ( npy[:i] - m*npx[:i] - c ) ** 2 )
263 rsq = 1 - sse/sst
264 #print i, x[i], y[i], m, c, rsq
265 return ( 1.0, 1.0 )
269 # ------------------------------------
270 # Classes
271 # ------------------------------------
272 cdef class CallerFromAlignments:
273 """A unit to calculate scores and call peaks from alignments --
274 FWTrack or PETrack objects.
276 It will compute for each chromosome separately in order to save
277 memory usage.
279 cdef:
280 object treat # FWTrack or PETrackI object for ChIP
281 object ctrl # FWTrack or PETrackI object for Control
283 int32_t d # extension size for ChIP
284 list ctrl_d_s # extension sizes for Control. Can be multiple values
285 float32_t treat_scaling_factor # scaling factor for ChIP
286 list ctrl_scaling_factor_s # scaling factor for Control, corresponding to each extension size.
287 float32_t lambda_bg # minimum local bias to fill missing values
288 list chromosomes # name of common chromosomes in ChIP and Control data
289 float64_t pseudocount # the pseudocount used to calcuate logLR, FE or logFE
290 bytes bedGraph_filename_prefix # prefix will be added to _pileup.bdg for treatment and _lambda.bdg for control
292 int32_t end_shift # shift of cutting ends before extension
293 bool trackline # whether trackline should be saved in bedGraph
294 bool save_bedGraph # whether to save pileup and local bias in bedGraph files
295 bool save_SPMR # whether to save pileup normalized by sequencing depth in million reads
296 bool no_lambda_flag # whether ignore local bias, and to use global bias instead
297 bool PE_mode # whether it's in PE mode, will be detected during initiation
299 # temporary data buffer
300 list chr_pos_treat_ctrl # temporary [position, treat_pileup, ctrl_pileup] for a given chromosome
301 bytes bedGraph_treat_filename
302 bytes bedGraph_control_filename
303 FILE * bedGraph_treat_f
304 FILE * bedGraph_ctrl_f
306 # data needed to be pre-computed before peak calling
307 object pqtable # remember pvalue->qvalue convertion; saved in cykhash Float32to32Map
308 bool pvalue_all_done # whether the pvalue of whole genome is all calculated. If yes, it's OK to calculate q-value.
310 dict pvalue_npeaks # record for each pvalue cutoff, how many peaks can be called
311 dict pvalue_length # record for each pvalue cutoff, the total length of called peaks
312 float32_t optimal_p_cutoff # automatically decide the p-value cutoff ( can be translated into qvalue cutoff ) based
313 # on p-value to total peak length analysis.
314 bytes cutoff_analysis_filename # file to save the pvalue-npeaks-totallength table
316 dict pileup_data_files # Record the names of temporary files for storing pileup values of each chromosome
319 def __init__ (self, treat, ctrl,
320 int32_t d = 200, list ctrl_d_s = [200, 1000, 10000],
321 float32_t treat_scaling_factor = 1.0, list ctrl_scaling_factor_s = [1.0, 0.2, 0.02],
322 bool stderr_on = False,
323 float32_t pseudocount = 1,
324 int32_t end_shift = 0,
325 float32_t lambda_bg = 0,
326 bool save_bedGraph = False,
327 str bedGraph_filename_prefix = "PREFIX",
328 str bedGraph_treat_filename = "TREAT.bdg",
329 str bedGraph_control_filename = "CTRL.bdg",
330 str cutoff_analysis_filename = "TMP.txt",
331 bool save_SPMR = False ):
332 """Initialize.
334 A calculator is unique to each comparison of treat and
335 control. Treat_depth and ctrl_depth should not be changed
336 during calculation.
338 treat and ctrl are either FWTrack or PETrackI objects.
340 treat_depth and ctrl_depth are effective depth in million:
341 sequencing depth in million after
342 duplicates being filtered. If
343 treatment is scaled down to
344 control sample size, then this
345 should be control sample size in
346 million. And vice versa.
348 d, sregion, lregion: d is the fragment size, sregion is the
349 small region size, lregion is the large
350 region size
352 pseudocount: a pseudocount used to calculate logLR, FE or
353 logFE. Please note this value will not be changed
354 with normalization method. So if you really want
355 to set pseudocount 1 per million reads, set it
356 after you normalize treat and control by million
357 reads by `change_normalizetion_method(ord('M'))`.
360 cdef:
361 set chr1, chr2
362 int32_t i
363 char * tmp
364 bytes tmp_bytes
365 float32_t p
366 # decide PE mode
367 if isinstance(treat, FWTrack):
368 self.PE_mode = False
369 elif isinstance(treat, PETrackI):
370 self.PE_mode = True
371 else:
372 raise Exception("Should be FWTrack or PETrackI object!")
373 # decide if there is control
374 self.treat = treat
375 if ctrl:
376 self.ctrl = ctrl
377 else: # while there is no control
378 self.ctrl = treat
379 self.trackline = False
380 self.d = d # note, self.d doesn't make sense in PE mode
381 self.ctrl_d_s = ctrl_d_s# note, self.d doesn't make sense in PE mode
382 self.treat_scaling_factor = treat_scaling_factor
383 self.ctrl_scaling_factor_s= ctrl_scaling_factor_s
384 self.end_shift = end_shift
385 self.lambda_bg = lambda_bg
386 self.pqtable = Float32to32Map( for_int = False ) # Float32 -> Float32 map
387 self.save_bedGraph = save_bedGraph
388 self.save_SPMR = save_SPMR
389 self.bedGraph_filename_prefix = bedGraph_filename_prefix.encode()
390 self.bedGraph_treat_filename = bedGraph_treat_filename.encode()
391 self.bedGraph_control_filename = bedGraph_control_filename.encode()
392 if not self.ctrl_d_s or not self.ctrl_scaling_factor_s:
393 self.no_lambda_flag = True
394 else:
395 self.no_lambda_flag = False
396 self.pseudocount = pseudocount
397 # get the common chromosome names from both treatment and control
398 chr1 = set(self.treat.get_chr_names())
399 chr2 = set(self.ctrl.get_chr_names())
400 self.chromosomes = sorted(list(chr1.intersection(chr2)))
402 self.pileup_data_files = {}
403 self.pvalue_length = {}
404 self.pvalue_npeaks = {}
405 for p in np.arange( 0.3, 10, 0.3 ): # step for optimal cutoff is 0.3 in -log10pvalue, we try from pvalue 1E-10 (-10logp=10) to 0.5 (-10logp=0.3)
406 self.pvalue_length[ p ] = 0
407 self.pvalue_npeaks[ p ] = 0
408 self.optimal_p_cutoff = 0
409 self.cutoff_analysis_filename = cutoff_analysis_filename.encode()
411 cpdef destroy ( self ):
412 """Remove temporary files for pileup values of each chromosome.
414 Note: This function MUST be called if the class object won't
415 be used anymore.
418 cdef:
419 bytes f
421 for f in self.pileup_data_files.values():
422 if os.path.isfile( f ):
423 os.unlink( f )
424 return
426 cpdef set_pseudocount( self, float32_t pseudocount ):
427 self.pseudocount = pseudocount
429 cpdef enable_trackline( self ):
430 """Turn on trackline with bedgraph output
432 self.trackline = True
434 cdef __pileup_treat_ctrl_a_chromosome ( self, bytes chrom ):
435 """After this function is called, self.chr_pos_treat_ctrl will
436 be reset and assigned to the pileup values of the given
437 chromosome.
440 cdef:
441 list treat_pv, ctrl_pv
442 int64_t i
443 float32_t t
444 object f
445 str temp_filename
447 assert chrom in self.chromosomes, "chromosome %s is not valid." % chrom
449 # check backup file of pileup values. If not exists, create
450 # it. Otherwise, load them instead of calculating new pileup
451 # values.
452 if chrom in self.pileup_data_files:
453 try:
454 f = open( self.pileup_data_files[ chrom ],"rb" )
455 self.chr_pos_treat_ctrl = cPickle.load( f )
456 f.close()
457 return
458 except:
459 temp_fd, temp_filename = mkstemp()
460 os.close(temp_fd)
461 self.pileup_data_files[ chrom ] = temp_filename
462 else:
463 temp_fd, temp_filename = mkstemp()
464 os.close(temp_fd)
465 self.pileup_data_files[ chrom ] = temp_filename.encode()
467 # reset or clean existing self.chr_pos_treat_ctrl
468 if self.chr_pos_treat_ctrl: # not a beautiful way to clean
469 clean_up_ndarray( self.chr_pos_treat_ctrl[0] )
470 clean_up_ndarray( self.chr_pos_treat_ctrl[1] )
471 clean_up_ndarray( self.chr_pos_treat_ctrl[2] )
473 if self.PE_mode:
474 treat_pv = self.treat.pileup_a_chromosome ( chrom, [self.treat_scaling_factor,], baseline_value = 0.0 )
475 else:
476 treat_pv = self.treat.pileup_a_chromosome( chrom, [self.d,], [self.treat_scaling_factor,], baseline_value = 0.0,
477 directional = True,
478 end_shift = self.end_shift )
480 if not self.no_lambda_flag:
481 if self.PE_mode:
482 # note, we pileup up PE control as SE control because
483 # we assume the bias only can be captured at the
484 # surrounding regions of cutting sites from control experiments.
485 ctrl_pv = self.ctrl.pileup_a_chromosome_c( chrom, self.ctrl_d_s, self.ctrl_scaling_factor_s, baseline_value = self.lambda_bg )
486 else:
487 ctrl_pv = self.ctrl.pileup_a_chromosome( chrom, self.ctrl_d_s, self.ctrl_scaling_factor_s,
488 baseline_value = self.lambda_bg,
489 directional = False )
490 else:
491 ctrl_pv = [treat_pv[0][-1:], np.array([self.lambda_bg,], dtype="float32")] # set a global lambda
493 self.chr_pos_treat_ctrl = self.__chrom_pair_treat_ctrl( treat_pv, ctrl_pv)
495 # clean treat_pv and ctrl_pv
496 treat_pv = []
497 ctrl_pv = []
499 # save data to temporary file
500 try:
501 f = open(self.pileup_data_files[ chrom ],"wb")
502 cPickle.dump( self.chr_pos_treat_ctrl, f , protocol=2 )
503 f.close()
504 except:
505 # fail to write then remove the key in pileup_data_files
506 self.pileup_data_files.pop(chrom)
507 return
509 cdef list __chrom_pair_treat_ctrl ( self, treat_pv, ctrl_pv ):
510 """*private* Pair treat and ctrl pileup for each region.
512 treat_pv and ctrl_pv are [np.ndarray, np.ndarray].
514 return [p, t, c] list, each element is a numpy array.
516 cdef:
517 list ret
518 int64_t pre_p, index_ret, it, ic, lt, lc
519 np.ndarray[np.int32_t, ndim=1] t_p, c_p, ret_p
520 np.ndarray[np.float32_t, ndim=1] t_v, c_v, ret_t, ret_c
522 int32_t * t_p_ptr
523 int32_t * c_p_ptr
524 int32_t * ret_p_ptr
526 float32_t * t_v_ptr
527 float32_t * c_v_ptr
528 float32_t * ret_t_ptr
529 float32_t * ret_c_ptr
531 [ t_p, t_v ] = treat_pv
532 [ c_p, c_v ] = ctrl_pv
534 lt = t_p.shape[0]
535 lc = c_p.shape[0]
537 chrom_max_len = lt + lc
539 ret_p = np.zeros( chrom_max_len, dtype="int32" ) # position
540 ret_t = np.zeros( chrom_max_len, dtype="float32" ) # value from treatment
541 ret_c = np.zeros( chrom_max_len, dtype="float32" ) # value from control
543 t_p_ptr = <int32_t *> t_p.data
544 t_v_ptr = <float32_t *> t_v.data
545 c_p_ptr = <int32_t *> c_p.data
546 c_v_ptr = <float32_t *> c_v.data
547 ret_p_ptr = <int32_t *> ret_p.data
548 ret_t_ptr = <float32_t *>ret_t.data
549 ret_c_ptr = <float32_t *>ret_c.data
551 pre_p = 0
552 index_ret = 0
553 it = 0
554 ic = 0
556 while it < lt and ic < lc:
557 if t_p_ptr[0] < c_p_ptr[0]:
558 # clip a region from pre_p to p1, then set pre_p as p1.
559 ret_p_ptr[0] = t_p_ptr[0]
560 ret_t_ptr[0] = t_v_ptr[0]
561 ret_c_ptr[0] = c_v_ptr[0]
562 ret_p_ptr += 1
563 ret_t_ptr += 1
564 ret_c_ptr += 1
565 pre_p = t_p_ptr[0]
566 index_ret += 1
567 # call for the next p1 and v1
568 it += 1
569 t_p_ptr += 1
570 t_v_ptr += 1
571 elif t_p_ptr[0] > c_p_ptr[0]:
572 # clip a region from pre_p to p2, then set pre_p as p2.
573 ret_p_ptr[0] = c_p_ptr[0]
574 ret_t_ptr[0] = t_v_ptr[0]
575 ret_c_ptr[0] = c_v_ptr[0]
576 ret_p_ptr += 1
577 ret_t_ptr += 1
578 ret_c_ptr += 1
579 pre_p = c_p_ptr[0]
580 index_ret += 1
581 # call for the next p2 and v2
582 ic += 1
583 c_p_ptr += 1
584 c_v_ptr += 1
585 else:
586 # from pre_p to p1 or p2, then set pre_p as p1 or p2.
587 ret_p_ptr[0] = t_p_ptr[0]
588 ret_t_ptr[0] = t_v_ptr[0]
589 ret_c_ptr[0] = c_v_ptr[0]
590 ret_p_ptr += 1
591 ret_t_ptr += 1
592 ret_c_ptr += 1
593 pre_p = t_p_ptr[0]
594 index_ret += 1
595 # call for the next p1, v1, p2, v2.
596 it += 1
597 ic += 1
598 t_p_ptr += 1
599 t_v_ptr += 1
600 c_p_ptr += 1
601 c_v_ptr += 1
603 ret_p.resize( index_ret, refcheck=False)
604 ret_t.resize( index_ret, refcheck=False)
605 ret_c.resize( index_ret, refcheck=False)
606 return [ret_p, ret_t, ret_c]
608 cdef np.ndarray __cal_score ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2, cal_func ):
609 cdef:
610 int64_t i
611 np.ndarray[np.float32_t, ndim=1] s
612 assert array1.shape[0] == array2.shape[0]
613 s = np.zeros(array1.shape[0], dtype="float32")
614 for i in range(array1.shape[0]):
615 s[i] = cal_func( array1[i], array2[i] )
616 return s
618 cdef void __cal_pvalue_qvalue_table ( self ):
619 """After this function is called, self.pqtable is built. All
620 chromosomes will be iterated. So it will take some time.
623 cdef:
624 bytes chrom
625 np.ndarray pos_array, treat_array, ctrl_array, score_array
626 dict pscore_stat
627 int64_t n, pre_p, length, pre_l, l, i, j
628 float32_t this_v, pre_v, v, q, pre_q
629 int64_t N, k, this_l
630 float32_t f
631 list unique_values
632 int32_t * pos_ptr
633 float32_t * treat_value_ptr
634 float32_t * ctrl_value_ptr
636 debug ( "Start to calculate pvalue stat..." )
638 pscore_stat = {} #dict()
639 for i in range( len( self.chromosomes ) ):
640 chrom = self.chromosomes[ i ]
641 pre_p = 0
643 self.__pileup_treat_ctrl_a_chromosome( chrom )
644 [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl
646 pos_ptr = <int32_t *> pos_array.data
647 treat_value_ptr = <float32_t *> treat_array.data
648 ctrl_value_ptr = <float32_t *> ctrl_array.data
650 for j in range(pos_array.shape[0]):
651 this_v = get_pscore( (<int32_t>(treat_value_ptr[0]), ctrl_value_ptr[0] ) )
652 this_l = pos_ptr[0] - pre_p
653 if this_v in pscore_stat:
654 pscore_stat[ this_v ] += this_l
655 else:
656 pscore_stat[ this_v ] = this_l
657 pre_p = pos_ptr[0]
658 pos_ptr += 1
659 treat_value_ptr += 1
660 ctrl_value_ptr += 1
662 N = sum(pscore_stat.values()) # total length
663 k = 1 # rank
664 f = -log10(N)
665 pre_v = -2147483647
666 pre_l = 0
667 pre_q = 2147483647 # save the previous q-value
669 self.pqtable = Float32to32Map( for_int = False )
670 unique_values = sorted(list(pscore_stat.keys()), reverse=True)
671 for i in range(len(unique_values)):
672 v = unique_values[i]
673 l = pscore_stat[v]
674 q = v + (log10(k) + f)
675 if q > pre_q:
676 q = pre_q
677 if q <= 0:
678 q = 0
679 break
680 #q = max(0,min(pre_q,q)) # make q-score monotonic
681 self.pqtable[ v ] = q
682 pre_q = q
683 k += l
684 # bottom rank pscores all have qscores 0
685 for j in range(i, len(unique_values) ):
686 v = unique_values[ j ]
687 self.pqtable[ v ] = 0
688 return
690 cdef void __pre_computes ( self, int32_t max_gap = 50, int32_t min_length = 200 ):
691 """After this function is called, self.pqtable and self.pvalue_length is built. All
692 chromosomes will be iterated. So it will take some time.
695 cdef:
696 bytes chrom
697 np.ndarray pos_array, treat_array, ctrl_array, score_array
698 dict pscore_stat
699 int64_t n, pre_p, this_p, length, j, pre_l, l, i
700 float32_t q, pre_q, this_t, this_c
701 float32_t this_v, pre_v, v, cutoff
702 int64_t N, k, this_l
703 float32_t f
704 list unique_values
705 float64_t t0, t1, t
707 np.ndarray above_cutoff, above_cutoff_endpos, above_cutoff_startpos
708 list peak_content
709 int64_t peak_length, total_l, total_p
711 list tmplist
713 int32_t * acs_ptr # above cutoff start position pointer
714 int32_t * ace_ptr # above cutoff end position pointer
715 int32_t * pos_array_ptr # position array pointer
716 float32_t * score_array_ptr # score array pointer
718 debug ( "Start to calculate pvalue stat..." )
720 # tmplist contains a list of log pvalue cutoffs from 0.3 to 10
721 tmplist = [round(x,5) for x in sorted( list(np.arange(0.3, 10.0, 0.3)), reverse = True )]
723 pscore_stat = {} #dict()
724 #print (list(pscore_stat.keys()))
725 #print (list(self.pvalue_length.keys()))
726 #print (list(self.pvalue_npeaks.keys()))
727 for i in range( len( self.chromosomes ) ):
728 chrom = self.chromosomes[ i ]
729 self.__pileup_treat_ctrl_a_chromosome( chrom )
730 [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl
732 score_array = self.__cal_pscore( treat_array, ctrl_array )
734 for n in range( len( tmplist ) ):
735 cutoff = tmplist[ n ]
736 total_l = 0 # total length in potential peak
737 total_p = 0
739 # get the regions with scores above cutoffs
740 above_cutoff = np.nonzero( score_array > cutoff )[0]# this is not an optimized method. It would be better to store score array in a 2-D ndarray?
741 above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff
742 above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff
744 if above_cutoff_endpos.size == 0:
745 continue
747 # first bit of region above cutoff
748 acs_ptr = <int32_t *> above_cutoff_startpos.data
749 ace_ptr = <int32_t *> above_cutoff_endpos.data
751 peak_content = [( acs_ptr[ 0 ], ace_ptr[ 0 ] ), ]
752 lastp = ace_ptr[ 0 ]
753 acs_ptr += 1
754 ace_ptr += 1
756 for i in range( 1, above_cutoff_startpos.size ):
757 tl = acs_ptr[ 0 ] - lastp
758 if tl <= max_gap:
759 peak_content.append( ( acs_ptr[ 0 ], ace_ptr[ 0 ] ) )
760 else:
761 peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
762 if peak_length >= min_length: # if the peak is too small, reject it
763 total_l += peak_length
764 total_p += 1
765 peak_content = [ ( acs_ptr[ 0 ], ace_ptr[ 0 ] ), ]
766 lastp = ace_ptr[ 0 ]
767 acs_ptr += 1
768 ace_ptr += 1
770 if peak_content:
771 peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
772 if peak_length >= min_length: # if the peak is too small, reject it
773 total_l += peak_length
774 total_p += 1
775 self.pvalue_length[ cutoff ] = self.pvalue_length.get( cutoff, 0 ) + total_l
776 self.pvalue_npeaks[ cutoff ] = self.pvalue_npeaks.get( cutoff, 0 ) + total_p
778 pos_array_ptr = <int32_t *> pos_array.data
779 score_array_ptr = <float32_t *> score_array.data
781 pre_p = 0
782 for i in range(pos_array.shape[0]):
783 this_p = pos_array_ptr[ 0 ]
784 this_l = this_p - pre_p
785 this_v = score_array_ptr[ 0 ]
786 if this_v in pscore_stat:
787 pscore_stat[ this_v ] += this_l
788 else:
789 pscore_stat[ this_v ] = this_l
790 pre_p = this_p #pos_array[ i ]
791 pos_array_ptr += 1
792 score_array_ptr += 1
794 #debug ( "make pscore_stat cost %.5f seconds" % t )
796 # add all pvalue cutoffs from cutoff-analysis part. So that we
797 # can get the corresponding qvalues for them.
798 for cutoff in tmplist:
799 if cutoff not in pscore_stat:
800 pscore_stat[ cutoff ] = 0
802 nhval = 0
804 N = sum(pscore_stat.values()) # total length
805 k = 1 # rank
806 f = -log10(N)
807 pre_v = -2147483647
808 pre_l = 0
809 pre_q = 2147483647 # save the previous q-value
811 self.pqtable = Float32to32Map( for_int = False ) #{}
812 unique_values = sorted(list(pscore_stat.keys()), reverse=True) #sorted(unique_values,reverse=True)
813 for i in range(len(unique_values)):
814 v = unique_values[i]
815 l = pscore_stat[v]
816 q = v + (log10(k) + f)
817 if q > pre_q:
818 q = pre_q
819 if q <= 0:
820 q = 0
821 break
822 #q = max(0,min(pre_q,q)) # make q-score monotonic
823 self.pqtable[ v ] = q
824 pre_v = v
825 pre_q = q
826 k+=l
827 for j in range(i, len(unique_values) ):
828 v = unique_values[ j ]
829 self.pqtable[ v ] = 0
831 # write pvalue and total length of predicted peaks
832 # this is the output from cutoff-analysis
833 fhd = open( self.cutoff_analysis_filename, "w" )
834 fhd.write( "pscore\tqscore\tnpeaks\tlpeaks\tavelpeak\n" )
835 x = []
836 y = []
837 for cutoff in tmplist:
838 if self.pvalue_npeaks[ cutoff ] > 0:
839 fhd.write( "%.2f\t%.2f\t%d\t%d\t%.2f\n" % ( cutoff, self.pqtable[ cutoff ], self.pvalue_npeaks[ cutoff ], self.pvalue_length[ cutoff ], self.pvalue_length[ cutoff ]/self.pvalue_npeaks[ cutoff ] ) )
840 x.append( cutoff )
841 y.append( self.pvalue_length[ cutoff ] )
842 fhd.close()
843 info( "#3 Analysis of cutoff vs num of peaks or total length has been saved in %s" % self.cutoff_analysis_filename )
844 #info( "#3 Suggest a cutoff..." )
845 #optimal_cutoff, optimal_length = find_optimal_cutoff( x, y )
846 #info( "#3 -10log10pvalue cutoff %.2f will call approximately %.0f bps regions as significant regions" % ( optimal_cutoff, optimal_length ) )
847 #print (list(pqtable.keys()))
848 #print (list(self.pvalue_length.keys()))
849 #print (list(self.pvalue_npeaks.keys()))
850 return
852 cpdef call_peaks ( self, list scoring_function_symbols, list score_cutoff_s, int32_t min_length = 200,
853 int32_t max_gap = 50, bool call_summits = False, bool cutoff_analysis = False ):
854 """Call peaks for all chromosomes. Return a PeakIO object.
856 scoring_function_s: symbols of functions to calculate score. 'p' for pscore, 'q' for qscore, 'f' for fold change, 's' for subtraction. for example: ['p', 'q']
857 score_cutoff_s : cutoff values corresponding to scoring functions
858 min_length : minimum length of peak
859 max_gap : maximum gap of 'insignificant' regions within a peak. Note, for PE_mode, max_gap and max_length are both set as fragment length.
860 call_summits : boolean. Whether or not call sub-peaks.
861 save_bedGraph : whether or not to save pileup and control into a bedGraph file
863 cdef:
864 bytes chrom
865 bytes tmp_bytes
867 peaks = PeakIO()
869 # prepare p-q table
870 if len( self.pqtable ) == 0:
871 info("#3 Pre-compute pvalue-qvalue table...")
872 if cutoff_analysis:
873 info("#3 Cutoff vs peaks called will be analyzed!")
874 self.__pre_computes( max_gap = max_gap, min_length = min_length )
875 else:
876 self.__cal_pvalue_qvalue_table()
879 # prepare bedGraph file
880 if self.save_bedGraph:
881 self.bedGraph_treat_f = fopen( self.bedGraph_treat_filename, "w" )
882 self.bedGraph_ctrl_f = fopen( self.bedGraph_control_filename, "w" )
884 info ("#3 In the peak calling step, the following will be performed simultaneously:")
885 info ("#3 Write bedGraph files for treatment pileup (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_treat_pileup.bdg")
886 info ("#3 Write bedGraph files for control lambda (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_control_lambda.bdg")
888 if self.save_SPMR:
889 info ( "#3 --SPMR is requested, so pileup will be normalized by sequencing depth in million reads." )
890 elif self.treat_scaling_factor == 1:
891 info ( "#3 Pileup will be based on sequencing depth in treatment." )
892 else:
893 info ( "#3 Pileup will be based on sequencing depth in control." )
895 if self.trackline:
896 # this line is REQUIRED by the wiggle format for UCSC browser
897 tmp_bytes = ("track type=bedGraph name=\"treatment pileup\" description=\"treatment pileup after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode()
898 fprintf( self.bedGraph_treat_f, tmp_bytes )
899 tmp_bytes = ("track type=bedGraph name=\"control lambda\" description=\"control lambda after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode()
900 fprintf( self.bedGraph_ctrl_f, tmp_bytes )
902 info("#3 Call peaks for each chromosome...")
903 for chrom in self.chromosomes:
904 # treat/control bedGraph will be saved if requested by user.
905 self.__chrom_call_peak_using_certain_criteria ( peaks, chrom, scoring_function_symbols, score_cutoff_s, min_length, max_gap, call_summits, self.save_bedGraph )
907 # close bedGraph file
908 if self.save_bedGraph:
909 fclose(self.bedGraph_treat_f)
910 fclose(self.bedGraph_ctrl_f)
911 self.save_bedGraph = False
913 return peaks
915 cdef void __chrom_call_peak_using_certain_criteria ( self, peaks, bytes chrom, list scoring_function_s, list score_cutoff_s, int32_t min_length,
916 int32_t max_gap, bool call_summits, bool save_bedGraph ):
917 """ Call peaks for a chromosome.
919 Combination of criteria is allowed here.
921 peaks: a PeakIO object, the return value of this function
922 scoring_function_s: symbols of functions to calculate score as score=f(x, y) where x is treatment pileup, and y is control pileup
923 save_bedGraph : whether or not to save pileup and control into a bedGraph file
925 cdef:
926 float64_t t0
927 int32_t i, n
928 str s
929 np.ndarray above_cutoff
930 np.ndarray[np.int32_t, ndim=1] above_cutoff_endpos, above_cutoff_startpos, pos_array, above_cutoff_index_array
932 np.ndarray[np.float32_t, ndim=1] treat_array, ctrl_array
933 list score_array_s # list to keep different types of scores
934 list peak_content # to store information for a
935 # chunk in a peak region, it
936 # contains lists of: 1. left
937 # position; 2. right
938 # position; 3. treatment
939 # value; 4. control value;
940 # 5. list of scores at this
941 # chunk
942 int64_t tl, lastp, ts, te, ti
943 float32_t tp, cp
944 int32_t * acs_ptr
945 int32_t * ace_ptr
946 int32_t * acia_ptr
947 float32_t * treat_array_ptr
948 float32_t * ctrl_array_ptr
951 assert len(scoring_function_s) == len(score_cutoff_s), "number of functions and cutoffs should be the same!"
953 peak_content = [] # to store points above cutoff
955 # first, build pileup, self.chr_pos_treat_ctrl
956 # this step will be speeped up if pqtable is pre-computed.
957 self.__pileup_treat_ctrl_a_chromosome( chrom )
958 [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl
960 # while save_bedGraph is true, invoke __write_bedGraph_for_a_chromosome
961 if save_bedGraph:
962 self.__write_bedGraph_for_a_chromosome ( chrom )
964 # keep all types of scores needed
965 #t0 = ttime()
966 score_array_s = []
967 for i in range(len(scoring_function_s)):
968 s = scoring_function_s[i]
969 if s == 'p':
970 score_array_s.append( self.__cal_pscore( treat_array, ctrl_array ) )
971 elif s == 'q':
972 score_array_s.append( self.__cal_qscore( treat_array, ctrl_array ) )
973 elif s == 'f':
974 score_array_s.append( self.__cal_FE( treat_array, ctrl_array ) )
975 elif s == 's':
976 score_array_s.append( self.__cal_subtraction( treat_array, ctrl_array ) )
978 # get the regions with scores above cutoffs
979 above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,score_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray?
980 above_cutoff_index_array = np.arange(pos_array.shape[0],dtype="int32")[above_cutoff] # indices
981 above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff
982 above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff
984 if above_cutoff.size == 0:
985 # nothing above cutoff
986 return
988 if above_cutoff[0] == 0:
989 # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
990 above_cutoff_startpos[0] = 0
992 #print "apply cutoff -- chrom:",chrom," time:", ttime() - t0
993 # start to build peak regions
994 #t0 = ttime()
996 # first bit of region above cutoff
997 acs_ptr = <int32_t *>above_cutoff_startpos.data
998 ace_ptr = <int32_t *>above_cutoff_endpos.data
999 acia_ptr= <int32_t *>above_cutoff_index_array.data
1000 treat_array_ptr = <float32_t *> treat_array.data
1001 ctrl_array_ptr = <float32_t *> ctrl_array.data
1003 ts = acs_ptr[ 0 ]
1004 te = ace_ptr[ 0 ]
1005 ti = acia_ptr[ 0 ]
1006 tp = treat_array_ptr[ ti ]
1007 cp = ctrl_array_ptr[ ti ]
1009 peak_content.append( ( ts, te, tp, cp, ti ) )
1010 lastp = te
1011 acs_ptr += 1
1012 ace_ptr += 1
1013 acia_ptr+= 1
1015 for i in range( 1, above_cutoff_startpos.shape[0] ):
1016 ts = acs_ptr[ 0 ]
1017 te = ace_ptr[ 0 ]
1018 ti = acia_ptr[ 0 ]
1019 acs_ptr += 1
1020 ace_ptr += 1
1021 acia_ptr+= 1
1022 tp = treat_array_ptr[ ti ]
1023 cp = ctrl_array_ptr[ ti ]
1024 tl = ts - lastp
1025 if tl <= max_gap:
1026 # append.
1027 peak_content.append( ( ts, te, tp, cp, ti ) )
1028 lastp = te #above_cutoff_endpos[i]
1029 else:
1030 # close
1031 if call_summits:
1032 self.__close_peak_with_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'
1033 else:
1034 self.__close_peak_wo_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'
1035 peak_content = [ ( ts, te, tp, cp, ti ), ]
1036 lastp = te #above_cutoff_endpos[i]
1037 # save the last peak
1038 if not peak_content:
1039 return
1040 else:
1041 if call_summits:
1042 self.__close_peak_with_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'
1043 else:
1044 self.__close_peak_wo_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'
1046 #print "close peaks -- chrom:",chrom," time:", ttime() - t0
1047 return
1049 cdef bool __close_peak_wo_subpeaks (self, list peak_content, peaks, int32_t min_length,
1050 bytes chrom, int32_t smoothlen, list score_array_s, list score_cutoff_s=[]):
1051 """Close the peak region, output peak boundaries, peak summit
1052 and scores, then add the peak to peakIO object.
1054 peak_content contains [start, end, treat_p, ctrl_p, index_in_score_array]
1056 peaks: a PeakIO object
1059 cdef:
1060 int32_t summit_pos, tstart, tend, tmpindex, summit_index, i, midindex
1061 float64_t treat_v, ctrl_v, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score
1062 int32_t tlist_scores_p
1064 peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
1065 if peak_length >= min_length: # if the peak is too small, reject it
1066 tsummit = []
1067 summit_pos = 0
1068 summit_value = 0
1069 for i in range(len(peak_content)):
1070 (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i]
1071 tscore = ttreat_p # use pscore as general score to find summit
1072 if not summit_value or summit_value < tscore:
1073 tsummit = [(tend + tstart) // 2, ]
1074 tsummit_index = [ i, ]
1075 summit_value = tscore
1076 elif summit_value == tscore:
1077 # remember continuous summit values
1078 tsummit.append((tend + tstart) // 2)
1079 tsummit_index.append( i )
1080 # the middle of all highest points in peak region is defined as summit
1081 midindex = (len(tsummit) + 1) // 2 - 1
1082 summit_pos = tsummit[ midindex ]
1083 summit_index = tsummit_index[ midindex ]
1085 summit_treat = peak_content[ summit_index ][ 2 ]
1086 summit_ctrl = peak_content[ summit_index ][ 3 ]
1088 # this is a double-check to see if the summit can pass cutoff values.
1089 for i in range(len(score_cutoff_s)):
1090 if score_cutoff_s[i] > score_array_s[ i ][ peak_content[ summit_index ][ 4 ] ]:
1091 return False # not passed, then disgard this peak.
1093 summit_p_score = pscore_dict[ ( <int32_t>(summit_treat), summit_ctrl ) ] #get_pscore(( <int32_t>(summit_treat), summit_ctrl ) )
1094 summit_q_score = self.pqtable[ summit_p_score ]
1096 peaks.add( chrom, # chromosome
1097 peak_content[0][0], # start
1098 peak_content[-1][1], # end
1099 summit = summit_pos, # summit position
1100 peak_score = summit_q_score, # score at summit
1101 pileup = summit_treat, # pileup
1102 pscore = summit_p_score, # pvalue
1103 fold_change = ( summit_treat + self.pseudocount ) / ( summit_ctrl + self.pseudocount ), # fold change
1104 qscore = summit_q_score # qvalue
1106 # start a new peak
1107 return True
1109 cdef bool __close_peak_with_subpeaks (self, list peak_content, peaks, int32_t min_length,
1110 bytes chrom, int32_t smoothlen, list score_array_s, list score_cutoff_s=[],
1111 float32_t min_valley = 0.9 ):
1112 """Algorithm implemented by Ben, to profile the pileup signals
1113 within a peak region then find subpeak summits. This method is
1114 highly recommended for TFBS or DNAase I sites.
1117 cdef:
1118 int32_t summit_pos, tstart, tend, tmpindex, summit_index, summit_offset
1119 int32_t start, end, i, j, start_boundary, m, n, l
1120 float64_t summit_value, tvalue, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score
1121 np.ndarray[np.float32_t, ndim=1] peakdata
1122 np.ndarray[np.int32_t, ndim=1] peakindices, summit_offsets
1123 int32_t tlist_scores_p
1125 peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
1127 if peak_length < min_length: return # if the region is too small, reject it
1129 # Add 10 bp padding to peak region so that we can get true minima
1130 end = peak_content[ -1 ][ 1 ] + 10
1131 start = peak_content[ 0 ][ 0 ] - 10
1132 if start < 0:
1133 start_boundary = 10 + start # this is the offset of original peak boundary in peakdata list.
1134 start = 0
1135 else:
1136 start_boundary = 10 # this is the offset of original peak boundary in peakdata list.
1138 peakdata = np.zeros(end - start, dtype='float32') # save the scores (qscore) for each position in this region
1139 peakindices = np.zeros(end - start, dtype='int32') # save the indices for each position in this region
1140 for i in range(len(peak_content)):
1141 (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i]
1142 tscore = ttreat_p # use pileup as general score to find summit
1143 m = tstart - start + start_boundary
1144 n = tend - start + start_boundary
1145 peakdata[m:n] = tscore
1146 peakindices[m:n] = i
1148 summit_offsets = maxima(peakdata, smoothlen) # offsets are the indices for summits in peakdata/peakindices array.
1150 if summit_offsets.shape[0] == 0:
1151 # **failsafe** if no summits, fall back on old approach #
1152 return self.__close_peak_wo_subpeaks(peak_content, peaks, min_length, chrom, smoothlen, score_array_s, score_cutoff_s)
1153 else:
1154 # remove maxima that occurred in padding
1155 m = np.searchsorted(summit_offsets, start_boundary)
1156 n = np.searchsorted(summit_offsets, peak_length + start_boundary, 'right')
1157 summit_offsets = summit_offsets[m:n]
1159 summit_offsets = enforce_peakyness(peakdata, summit_offsets)
1161 #print "enforced:",summit_offsets
1162 if summit_offsets.shape[0] == 0:
1163 # **failsafe** if no summits, fall back on old approach #
1164 return self.__close_peak_wo_subpeaks(peak_content, peaks, min_length, chrom, smoothlen, score_array_s, score_cutoff_s)
1166 summit_indices = peakindices[summit_offsets] # indices are those point to peak_content
1167 summit_offsets -= start_boundary
1169 for summit_offset, summit_index in list(zip(summit_offsets, summit_indices)):
1171 summit_treat = peak_content[ summit_index ][ 2 ]
1172 summit_ctrl = peak_content[ summit_index ][ 3 ]
1174 summit_p_score = pscore_dict[ ( <int32_t>(summit_treat), summit_ctrl ) ] # get_pscore(( <int32_t>(summit_treat), summit_ctrl ) )
1175 summit_q_score = self.pqtable[ summit_p_score ]
1177 for i in range(len(score_cutoff_s)):
1178 if score_cutoff_s[i] > score_array_s[ i ][ peak_content[ summit_index ][ 4 ] ]:
1179 return False # not passed, then disgard this summit.
1181 peaks.add( chrom,
1182 peak_content[ 0 ][ 0 ],
1183 peak_content[ -1 ][ 1 ],
1184 summit = start + summit_offset,
1185 peak_score = summit_q_score,
1186 pileup = summit_treat,
1187 pscore = summit_p_score,
1188 fold_change = (summit_treat + self.pseudocount ) / ( summit_ctrl + self.pseudocount ), # fold change
1189 qscore = summit_q_score
1191 # start a new peak
1192 return True
1194 cdef np.ndarray __cal_pscore ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
1195 cdef:
1196 int64_t i, array1_size
1197 np.ndarray[np.float32_t, ndim=1] s
1198 float32_t * a1_ptr
1199 float32_t * a2_ptr
1200 float32_t * s_ptr
1202 assert array1.shape[0] == array2.shape[0]
1203 s = np.zeros(array1.shape[0], dtype="float32")
1205 a1_ptr = <float32_t *> array1.data
1206 a2_ptr = <float32_t *> array2.data
1207 s_ptr = <float32_t *> s.data
1209 array1_size = array1.shape[0]
1211 for i in range(array1_size):
1212 s_ptr[0] = get_pscore(( <int32_t>(a1_ptr[0]), a2_ptr[0] ))
1213 s_ptr += 1
1214 a1_ptr += 1
1215 a2_ptr += 1
1216 return s
1218 cdef np.ndarray __cal_qscore ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
1219 cdef:
1220 int64_t i, array1_size
1221 np.ndarray[np.float32_t, ndim=1] s
1222 float32_t * a1_ptr
1223 float32_t * a2_ptr
1224 float32_t * s_ptr
1226 assert array1.shape[0] == array2.shape[0]
1227 s = np.zeros(array1.shape[0], dtype="float32")
1229 a1_ptr = <float32_t *> array1.data
1230 a2_ptr = <float32_t *> array2.data
1231 s_ptr = <float32_t *> s.data
1233 for i in range(array1.shape[0]):
1234 s_ptr[0] = self.pqtable[ get_pscore(( <int32_t>(a1_ptr[0]), a2_ptr[0] )) ]
1235 s_ptr += 1
1236 a1_ptr += 1
1237 a2_ptr += 1
1238 return s
1240 cdef np.ndarray __cal_logLR ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
1241 cdef:
1242 int64_t i, array1_size
1243 np.ndarray[np.float32_t, ndim=1] s
1244 float32_t * a1_ptr
1245 float32_t * a2_ptr
1246 float32_t * s_ptr
1248 assert array1.shape[0] == array2.shape[0]
1249 s = np.zeros(array1.shape[0], dtype="float32")
1251 a1_ptr = <float32_t *> array1.data
1252 a2_ptr = <float32_t *> array2.data
1253 s_ptr = <float32_t *> s.data
1255 for i in range(array1.shape[0]):
1256 s_ptr[0] = get_logLR_asym( (a1_ptr[0] + self.pseudocount, a2_ptr[0] + self.pseudocount ) )
1257 s_ptr += 1
1258 a1_ptr += 1
1259 a2_ptr += 1
1260 return s
1262 cdef np.ndarray __cal_logFE ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
1263 cdef:
1264 int64_t i, array1_size
1265 np.ndarray[np.float32_t, ndim=1] s
1266 float32_t * a1_ptr
1267 float32_t * a2_ptr
1268 float32_t * s_ptr
1270 assert array1.shape[0] == array2.shape[0]
1271 s = np.zeros(array1.shape[0], dtype="float32")
1273 a1_ptr = <float32_t *> array1.data
1274 a2_ptr = <float32_t *> array2.data
1275 s_ptr = <float32_t *> s.data
1277 for i in range(array1.shape[0]):
1278 s_ptr[0] = get_logFE( a1_ptr[0] + self.pseudocount, a2_ptr[0] + self.pseudocount )
1279 s_ptr += 1
1280 a1_ptr += 1
1281 a2_ptr += 1
1282 return s
1284 cdef np.ndarray __cal_FE ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
1285 cdef:
1286 int64_t i, array1_size
1287 np.ndarray[np.float32_t, ndim=1] s
1288 float32_t * a1_ptr
1289 float32_t * a2_ptr
1290 float32_t * s_ptr
1292 assert array1.shape[0] == array2.shape[0]
1293 s = np.zeros(array1.shape[0], dtype="float32")
1295 a1_ptr = <float32_t *> array1.data
1296 a2_ptr = <float32_t *> array2.data
1297 s_ptr = <float32_t *> s.data
1299 for i in range(array1.shape[0]):
1300 s_ptr[0] = (a1_ptr[0] + self.pseudocount) / ( a2_ptr[0] + self.pseudocount )
1301 s_ptr += 1
1302 a1_ptr += 1
1303 a2_ptr += 1
1304 return s
1306 cdef np.ndarray __cal_subtraction ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
1307 cdef:
1308 int64_t i, array1_size
1309 np.ndarray[np.float32_t, ndim=1] s
1310 float32_t * a1_ptr
1311 float32_t * a2_ptr
1312 float32_t * s_ptr
1314 assert array1.shape[0] == array2.shape[0]
1315 s = np.zeros(array1.shape[0], dtype="float32")
1317 a1_ptr = <float32_t *> array1.data
1318 a2_ptr = <float32_t *> array2.data
1319 s_ptr = <float32_t *> s.data
1321 for i in range(array1.shape[0]):
1322 s_ptr[0] = a1_ptr[0] - a2_ptr[0]
1323 s_ptr += 1
1324 a1_ptr += 1
1325 a2_ptr += 1
1326 return s
1329 cdef bool __write_bedGraph_for_a_chromosome ( self, bytes chrom ):
1330 """Write treat/control values for a certain chromosome into a
1331 specified file handler.
1334 cdef:
1335 np.ndarray[np.int32_t, ndim=1] pos_array
1336 np.ndarray[np.float32_t, ndim=1] treat_array, ctrl_array
1337 int32_t * pos_array_ptr
1338 float32_t * treat_array_ptr
1339 float32_t * ctrl_array_ptr
1340 int32_t l, i
1341 int32_t p, pre_p_t, pre_p_c # current position, previous position for treat, previous position for control
1342 float32_t pre_v_t, pre_v_c, v_t, v_c # previous value for treat, for control, current value for treat, for control
1343 float32_t denominator # 1 if save_SPMR is false, or depth in million if save_SPMR is true. Note, while piling up and calling peaks, treatment and control have been scaled to the same depth, so we need to find what this 'depth' is.
1344 FILE * ft
1345 FILE * fc
1346 basestring tmp_bytes
1348 [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl
1349 pos_array_ptr = <int32_t *> pos_array.data
1350 treat_array_ptr = <float32_t *> treat_array.data
1351 ctrl_array_ptr = <float32_t *> ctrl_array.data
1353 if self.save_SPMR:
1354 if self.treat_scaling_factor == 1:
1355 # in this case, control has been asked to be scaled to depth of treatment
1356 denominator = self.treat.total/1e6
1357 else:
1358 # in this case, treatment has been asked to be scaled to depth of control
1359 denominator = self.ctrl.total/1e6
1360 else:
1361 denominator = 1.0
1363 l = pos_array.shape[ 0 ]
1365 if l == 0: # if there is no data, return
1366 return False
1368 ft = self.bedGraph_treat_f
1369 fc = self.bedGraph_ctrl_f
1370 #t_write_func = self.bedGraph_treat.write
1371 #c_write_func = self.bedGraph_ctrl.write
1373 pre_p_t = 0
1374 pre_p_c = 0
1375 pre_v_t = treat_array_ptr[ 0 ]/denominator
1376 pre_v_c = ctrl_array_ptr [ 0 ]/denominator
1377 treat_array_ptr += 1
1378 ctrl_array_ptr += 1
1380 for i in range( 1, l ):
1381 v_t = treat_array_ptr[ 0 ]/denominator
1382 v_c = ctrl_array_ptr [ 0 ]/denominator
1383 p = pos_array_ptr [ 0 ]
1384 pos_array_ptr += 1
1385 treat_array_ptr += 1
1386 ctrl_array_ptr += 1
1388 if abs(pre_v_t - v_t) > 1e-5: # precision is 5 digits
1389 fprintf( ft, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_t, p, pre_v_t )
1390 pre_v_t = v_t
1391 pre_p_t = p
1393 if abs(pre_v_c - v_c) > 1e-5: # precision is 5 digits
1394 fprintf( fc, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_c, p, pre_v_c )
1395 pre_v_c = v_c
1396 pre_p_c = p
1398 p = pos_array_ptr[ 0 ]
1399 # last one
1400 fprintf( ft, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_t, p, pre_v_t )
1401 fprintf( fc, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_c, p, pre_v_c )
1403 return True
1405 cpdef call_broadpeaks (self, list scoring_function_symbols, list lvl1_cutoff_s, list lvl2_cutoff_s, int32_t min_length=200, int32_t lvl1_max_gap=50, int32_t lvl2_max_gap=400, bool cutoff_analysis = False):
1406 """This function try to find enriched regions within which,
1407 scores are continuously higher than a given cutoff for level
1408 1, and link them using the gap above level 2 cutoff with a
1409 maximum length of lvl2_max_gap.
1411 scoring_function_s: symbols of functions to calculate score. 'p' for pscore, 'q' for qscore, 'f' for fold change, 's' for subtraction. for example: ['p', 'q']
1413 lvl1_cutoff_s: list of cutoffs at highly enriched regions, corresponding to scoring functions.
1414 lvl2_cutoff_s: list of cutoffs at less enriched regions, corresponding to scoring functions.
1415 min_length : minimum peak length, default 200.
1416 lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50.
1417 lvl2_max_gap : maximum length of linkage regions, default 400.
1419 Return both general PeakIO object for highly enriched regions
1420 and gapped broad regions in BroadPeakIO.
1422 cdef:
1423 int32_t i, j
1424 bytes chrom
1425 object lvl1peaks, lvl1peakschrom, lvl1
1426 object lvl2peaks, lvl2peakschrom, lvl2
1427 object broadpeaks
1428 set chrs
1429 list tmppeakset
1431 lvl1peaks = PeakIO()
1432 lvl2peaks = PeakIO()
1434 # prepare p-q table
1435 if len( self.pqtable ) == 0:
1436 info("#3 Pre-compute pvalue-qvalue table...")
1437 if cutoff_analysis:
1438 info("#3 Cutoff value vs broad region calls will be analyzed!")
1439 self.__pre_computes( max_gap = lvl2_max_gap, min_length = min_length )
1440 else:
1441 self.__cal_pvalue_qvalue_table()
1443 # prepare bedGraph file
1444 if self.save_bedGraph:
1446 self.bedGraph_treat_f = fopen( self.bedGraph_treat_filename, "w" )
1447 self.bedGraph_ctrl_f = fopen( self.bedGraph_control_filename, "w" )
1448 info ("#3 In the peak calling step, the following will be performed simultaneously:")
1449 info ("#3 Write bedGraph files for treatment pileup (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_treat_pileup.bdg")
1450 info ("#3 Write bedGraph files for control lambda (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_control_lambda.bdg")
1452 if self.trackline:
1453 # this line is REQUIRED by the wiggle format for UCSC browser
1454 tmp_bytes = ("track type=bedGraph name=\"treatment pileup\" description=\"treatment pileup after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode()
1455 fprintf( self.bedGraph_treat_f, tmp_bytes )
1456 tmp_bytes = ("track type=bedGraph name=\"control lambda\" description=\"control lambda after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode()
1457 fprintf( self.bedGraph_ctrl_f, tmp_bytes )
1460 info("#3 Call peaks for each chromosome...")
1461 for chrom in self.chromosomes:
1462 self.__chrom_call_broadpeak_using_certain_criteria ( lvl1peaks, lvl2peaks, chrom, scoring_function_symbols, lvl1_cutoff_s, lvl2_cutoff_s, min_length, lvl1_max_gap, lvl2_max_gap, self.save_bedGraph )
1464 # close bedGraph file
1465 if self.save_bedGraph:
1466 fclose( self.bedGraph_treat_f )
1467 fclose( self.bedGraph_ctrl_f )
1468 #self.bedGraph_ctrl.close()
1469 self.save_bedGraph = False
1471 # now combine lvl1 and lvl2 peaks
1472 chrs = lvl1peaks.get_chr_names()
1473 broadpeaks = BroadPeakIO()
1474 # use lvl2_peaks as linking regions between lvl1_peaks
1475 for chrom in sorted(chrs):
1476 lvl1peakschrom = lvl1peaks.get_data_from_chrom(chrom)
1477 lvl2peakschrom = lvl2peaks.get_data_from_chrom(chrom)
1478 lvl1peakschrom_next = iter(lvl1peakschrom).__next__
1479 tmppeakset = [] # to temporarily store lvl1 region inside a lvl2 region
1480 # our assumption is lvl1 regions should be included in lvl2 regions
1481 try:
1482 lvl1 = lvl1peakschrom_next()
1483 for i in range( len(lvl2peakschrom) ):
1484 # for each lvl2 peak, find all lvl1 peaks inside
1485 # I assume lvl1 peaks can be ALL covered by lvl2 peaks.
1486 lvl2 = lvl2peakschrom[i]
1488 while True:
1489 if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]:
1490 tmppeakset.append(lvl1)
1491 lvl1 = lvl1peakschrom_next()
1492 else:
1493 # make a hierarchical broad peak
1494 #print lvl2["start"], lvl2["end"], lvl2["score"]
1495 self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset)
1496 tmppeakset = []
1497 break
1498 except StopIteration:
1499 # no more strong (aka lvl1) peaks left
1500 self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset)
1501 tmppeakset = []
1502 # add the rest lvl2 peaks
1503 for j in range( i+1, len(lvl2peakschrom) ):
1504 self.__add_broadpeak( broadpeaks, chrom, lvl2peakschrom[j], tmppeakset )
1506 return broadpeaks
1508 cdef void __chrom_call_broadpeak_using_certain_criteria ( self, lvl1peaks, lvl2peaks, bytes chrom, list scoring_function_s, list lvl1_cutoff_s, list lvl2_cutoff_s,
1509 int32_t min_length, int32_t lvl1_max_gap, int32_t lvl2_max_gap, bool save_bedGraph):
1510 """ Call peaks for a chromosome.
1512 Combination of criteria is allowed here.
1514 peaks: a PeakIO object
1515 scoring_function_s: symbols of functions to calculate score as score=f(x, y) where x is treatment pileup, and y is control pileup
1516 save_bedGraph : whether or not to save pileup and control into a bedGraph file
1518 cdef:
1519 int32_t i
1520 str s
1521 np.ndarray above_cutoff, above_cutoff_endpos, above_cutoff_startpos
1522 np.ndarray pos_array, treat_array, ctrl_array
1523 np.ndarray above_cutoff_index_array
1524 list score_array_s # list to keep different types of scores
1525 list peak_content
1526 int32_t * acs_ptr
1527 int32_t * ace_ptr
1528 int32_t * acia_ptr
1529 float32_t * treat_array_ptr
1530 float32_t * ctrl_array_ptr
1532 assert len(scoring_function_s) == len(lvl1_cutoff_s), "number of functions and cutoffs should be the same!"
1533 assert len(scoring_function_s) == len(lvl2_cutoff_s), "number of functions and cutoffs should be the same!"
1535 # first, build pileup, self.chr_pos_treat_ctrl
1536 self.__pileup_treat_ctrl_a_chromosome( chrom )
1537 [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl
1539 # while save_bedGraph is true, invoke __write_bedGraph_for_a_chromosome
1540 if save_bedGraph:
1541 self.__write_bedGraph_for_a_chromosome ( chrom )
1543 # keep all types of scores needed
1544 score_array_s = []
1545 for i in range(len(scoring_function_s)):
1546 s = scoring_function_s[i]
1547 if s == 'p':
1548 score_array_s.append( self.__cal_pscore( treat_array, ctrl_array ) )
1549 elif s == 'q':
1550 score_array_s.append( self.__cal_qscore( treat_array, ctrl_array ) )
1551 elif s == 'f':
1552 score_array_s.append( self.__cal_FE( treat_array, ctrl_array ) )
1553 elif s == 's':
1554 score_array_s.append( self.__cal_subtraction( treat_array, ctrl_array ) )
1556 # lvl1 : strong peaks
1557 peak_content = [] # to store points above cutoff
1559 # get the regions with scores above cutoffs
1560 above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,lvl1_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray?
1561 above_cutoff_index_array = np.arange(pos_array.shape[0],dtype="int32")[above_cutoff] # indices
1562 above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff
1563 above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff
1565 if above_cutoff.size == 0:
1566 # nothing above cutoff
1567 return
1569 if above_cutoff[0] == 0:
1570 # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
1571 above_cutoff_startpos[0] = 0
1573 # first bit of region above cutoff
1574 acs_ptr = <int32_t *>above_cutoff_startpos.data
1575 ace_ptr = <int32_t *>above_cutoff_endpos.data
1576 acia_ptr= <int32_t *>above_cutoff_index_array.data
1577 treat_array_ptr = <float32_t *> treat_array.data
1578 ctrl_array_ptr = <float32_t *> ctrl_array.data
1580 ts = acs_ptr[ 0 ]
1581 te = ace_ptr[ 0 ]
1582 ti = acia_ptr[ 0 ]
1583 tp = treat_array_ptr[ ti ]
1584 cp = ctrl_array_ptr[ ti ]
1586 peak_content.append( ( ts, te, tp, cp, ti ) )
1587 acs_ptr += 1 # move ptr
1588 ace_ptr += 1
1589 acia_ptr+= 1
1590 lastp = te
1592 #peak_content.append( (above_cutoff_startpos[0], above_cutoff_endpos[0], treat_array[above_cutoff_index_array[0]], ctrl_array[above_cutoff_index_array[0]], score_array_s, above_cutoff_index_array[0] ) )
1593 for i in range( 1, above_cutoff_startpos.size ):
1594 ts = acs_ptr[ 0 ]
1595 te = ace_ptr[ 0 ]
1596 ti = acia_ptr[ 0 ]
1597 acs_ptr += 1
1598 ace_ptr += 1
1599 acia_ptr+= 1
1600 tp = treat_array_ptr[ ti ]
1601 cp = ctrl_array_ptr[ ti ]
1602 tl = ts - lastp
1603 if tl <= lvl1_max_gap:
1604 # append
1605 #peak_content.append( (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]], score_array_s, above_cutoff_index_array[i] ) )
1606 peak_content.append( ( ts, te, tp, cp, ti ) )
1607 lastp = te
1608 else:
1609 # close
1610 self.__close_peak_for_broad_region (peak_content, lvl1peaks, min_length, chrom, lvl1_max_gap//2, score_array_s )
1611 #peak_content = [ (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]], score_array_s, above_cutoff_index_array[i]) , ]
1612 peak_content = [ ( ts, te, tp, cp, ti ), ]
1613 lastp = te #above_cutoff_endpos[i]
1615 # save the last peak
1616 if peak_content:
1617 self.__close_peak_for_broad_region (peak_content, lvl1peaks, min_length, chrom, lvl1_max_gap//2, score_array_s )
1619 # lvl2 : weak peaks
1620 peak_content = [] # to store points above cutoff
1622 # get the regions with scores above cutoffs
1623 above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,lvl2_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray?
1624 above_cutoff_index_array = np.arange(pos_array.shape[0],dtype="int32")[above_cutoff] # indices
1625 above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff
1626 above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff
1628 if above_cutoff.size == 0:
1629 # nothing above cutoff
1630 return
1632 if above_cutoff[0] == 0:
1633 # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
1634 above_cutoff_startpos[0] = 0
1636 # first bit of region above cutoff
1637 acs_ptr = <int32_t *>above_cutoff_startpos.data
1638 ace_ptr = <int32_t *>above_cutoff_endpos.data
1639 acia_ptr= <int32_t *>above_cutoff_index_array.data
1640 treat_array_ptr = <float32_t *> treat_array.data
1641 ctrl_array_ptr = <float32_t *> ctrl_array.data
1643 ts = acs_ptr[ 0 ]
1644 te = ace_ptr[ 0 ]
1645 ti = acia_ptr[ 0 ]
1646 tp = treat_array_ptr[ ti ]
1647 cp = ctrl_array_ptr[ ti ]
1648 peak_content.append( ( ts, te, tp, cp, ti ) )
1649 acs_ptr += 1 # move ptr
1650 ace_ptr += 1
1651 acia_ptr+= 1
1653 lastp = te
1654 for i in range( 1, above_cutoff_startpos.size ):
1655 # for everything above cutoff
1656 ts = acs_ptr[ 0 ] # get the start
1657 te = ace_ptr[ 0 ] # get the end
1658 ti = acia_ptr[ 0 ]# get the index
1660 acs_ptr += 1 # move ptr
1661 ace_ptr += 1
1662 acia_ptr+= 1
1663 tp = treat_array_ptr[ ti ] # get the treatment pileup
1664 cp = ctrl_array_ptr[ ti ] # get the control pileup
1665 tl = ts - lastp # get the distance from the current point to last position of existing peak_content
1667 if tl <= lvl2_max_gap:
1668 # append
1669 peak_content.append( ( ts, te, tp, cp, ti ) )
1670 lastp = te
1671 else:
1672 # close
1673 self.__close_peak_for_broad_region (peak_content, lvl2peaks, min_length, chrom, lvl2_max_gap//2, score_array_s )
1675 peak_content = [ ( ts, te, tp, cp, ti ), ]
1676 lastp = te
1678 # save the last peak
1679 if peak_content:
1680 self.__close_peak_for_broad_region (peak_content, lvl2peaks, min_length, chrom, lvl2_max_gap//2, score_array_s )
1682 return
1684 cdef bool __close_peak_for_broad_region (self, list peak_content, peaks, int32_t min_length,
1685 bytes chrom, int32_t smoothlen, list score_array_s, list score_cutoff_s=[]):
1686 """Close the broad peak region, output peak boundaries, peak summit
1687 and scores, then add the peak to peakIO object.
1689 peak_content contains [start, end, treat_p, ctrl_p, list_scores]
1691 peaks: a BroadPeakIO object
1694 cdef:
1695 int32_t summit_pos, tstart, tend, tmpindex, summit_index, i, midindex
1696 float64_t treat_v, ctrl_v, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score
1697 list tlist_pileup, tlist_control, tlist_length
1698 int32_t tlist_scores_p
1699 np.ndarray tarray_pileup, tarray_control, tarray_pscore, tarray_qscore, tarray_fc
1701 peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
1702 if peak_length >= min_length: # if the peak is too small, reject it
1703 tlist_pileup = []
1704 tlist_control= []
1705 tlist_length = []
1706 for i in range(len(peak_content)): # each position in broad peak
1707 (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i]
1708 tlist_pileup.append( ttreat_p )
1709 tlist_control.append( tctrl_p )
1710 tlist_length.append( tend - tstart )
1712 tarray_pileup = np.array( tlist_pileup, dtype="float32")
1713 tarray_control = np.array( tlist_control, dtype="float32")
1714 tarray_pscore = self.__cal_pscore( tarray_pileup, tarray_control )
1715 tarray_qscore = self.__cal_qscore( tarray_pileup, tarray_control )
1716 tarray_fc = self.__cal_FE ( tarray_pileup, tarray_control )
1718 peaks.add( chrom, # chromosome
1719 peak_content[0][0], # start
1720 peak_content[-1][1], # end
1721 summit = 0,
1722 peak_score = mean_from_value_length( tarray_qscore, tlist_length ),
1723 pileup = mean_from_value_length( tarray_pileup, tlist_length ),
1724 pscore = mean_from_value_length( tarray_pscore, tlist_length ),
1725 fold_change = mean_from_value_length( tarray_fc, tlist_length ),
1726 qscore = mean_from_value_length( tarray_qscore, tlist_length ),
1728 #if chrom == "chr1" and peak_content[0][0] == 237643 and peak_content[-1][1] == 237935:
1729 # print tarray_qscore, tlist_length
1730 # start a new peak
1731 return True
1733 cdef __add_broadpeak (self, bpeaks, bytes chrom, object lvl2peak, list lvl1peakset):
1734 """Internal function to create broad peak.
1736 *Note* lvl1peakset/strong_regions might be empty
1739 cdef:
1740 int32_t blockNum, start, end
1741 bytes blockSizes, blockStarts, thickStart, thickEnd,
1743 start = lvl2peak["start"]
1744 end = lvl2peak["end"]
1746 if not lvl1peakset:
1747 # will complement by adding 1bps start and end to this region
1748 # may change in the future if gappedPeak format was improved.
1749 bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=(b"%d" % start), thickEnd=(b"%d" % end),
1750 blockNum = 2, blockSizes = b"1,1", blockStarts = (b"0,%d" % (end-start-1)), pileup = lvl2peak["pileup"],
1751 pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"],
1752 qscore = lvl2peak["qscore"] )
1753 return bpeaks
1755 thickStart = b"%d" % (lvl1peakset[0]["start"])
1756 thickEnd = b"%d" % (lvl1peakset[-1]["end"])
1757 blockNum = len(lvl1peakset)
1758 blockSizes = b",".join([b"%d" % y for y in [x["length"] for x in lvl1peakset]])
1759 blockStarts = b",".join([b"%d" % x for x in getitem_then_subtract(lvl1peakset, start)])
1761 # add 1bp left and/or right block if necessary
1762 if int(thickStart) != start:
1763 # add 1bp left block
1764 thickStart = b"%d" % start
1765 blockNum += 1
1766 blockSizes = b"1,"+blockSizes
1767 blockStarts = b"0,"+blockStarts
1768 if int(thickEnd) != end:
1769 # add 1bp right block
1770 thickEnd = b"%d" % end
1771 blockNum += 1
1772 blockSizes = blockSizes + b",1"
1773 blockStarts = blockStarts + b"," + (b"%d" % (end-start-1))
1775 bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=thickStart, thickEnd=thickEnd,
1776 blockNum = blockNum, blockSizes = blockSizes, blockStarts = blockStarts, pileup = lvl2peak["pileup"],
1777 pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"],
1778 qscore = lvl2peak["qscore"] )
1779 return bpeaks