# cython: language_level=3
# cython: linetrace=True
# Time-stamp: <2022-09-15 17:06:17 Tao Liu>
6 """Module for Calculate Scores.
8 This code is free software; you can redistribute it and/or modify it
9 under the terms of the BSD License (see the file LICENSE included with
# ------------------------------------
# python modules
# ------------------------------------

from collections import Counter
from time import time as ttime
import _pickle as cPickle
from tempfile import mkstemp
import os

import logging
import MACS3.Utilities.Logger

logger = logging.getLogger(__name__)
debug = logger.debug
info = logger.info
# ------------------------------------
# Other modules
# ------------------------------------

import numpy as np
cimport numpy as np
from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t
from cpython cimport bool
from cykhash import PyObjectMap, Float32to32Map
# ------------------------------------
# C lib
# ------------------------------------

from libc.stdio cimport *
from libc.math cimport exp, log, log10, M_LN10, log1p, erf, sqrt, floor, ceil
# ------------------------------------
# MACS3 modules
# ------------------------------------

from MACS3.Signal.SignalProcessing import maxima, enforce_valleys, enforce_peakyness
from MACS3.IO.PeakIO import PeakIO, BroadPeakIO, parse_peakname
from MACS3.Signal.FixWidthTrack import FWTrack
from MACS3.Signal.PairedEndTrack import PETrackI
from MACS3.Signal.Prob import poisson_cdf
# --------------------------------------------
# cached pscore function and LR_asym functions
# --------------------------------------------
pscore_dict = PyObjectMap()
logLR_dict = PyObjectMap()
cdef float32_t get_pscore ( tuple t ):
    """t: tuple of ( observation, lambda )
    """
    cdef:
        float32_t val
    if t in pscore_dict:
        return pscore_dict[ t ]
    else:
        # calculate and cache
        val = -1.0 * poisson_cdf ( t[0], t[1], False, True )
        pscore_dict[ t ] = val
        return val
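# An illustrative, commented-out sketch of the memoization idea above in
# plain Python (the module itself uses a cykhash PyObjectMap as the cache;
# `poisson_cdf` is MACS3's, returning the upper-tail log10 p-value when
# called with lower=False, log10=True):
#
#   cache = {}
#   def pscore(observation, lam):
#       key = (observation, lam)
#       if key not in cache:
#           cache[key] = -1.0 * poisson_cdf(observation, lam, False, True)
#       return cache[key]
#
# so each cached value is the -log10 p-value of seeing `observation`
# reads given the local Poisson parameter `lam`.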
cdef float32_t get_logLR_asym ( tuple t ):
    """Calculate log10 Likelihood between H1 ( enriched ) and H0 (
    chromatin bias ). Set minus sign for depletion.
    """
    cdef:
        float32_t val
        float32_t x
        float32_t y
    if t in logLR_dict:
        return logLR_dict[ t ]
    else:
        x = t[0]
        y = t[1]
        # calculate and cache
        if x > y:
            val = (x*(log10(x)-log10(y))+y-x)
        elif x < y:
            val = (x*(-log10(x)+log10(y))-y+x)
        else:
            val = 0
        logLR_dict[ t ] = val
        return val
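# Worked example of the asymmetric formula above, as written (note the
# first term is in log10 units while `y - x` is linear):
# for t = (100.0, 10.0), i.e. x > y,
#   val = 100*(log10(100) - log10(10)) + 10 - 100 = 100*(2 - 1) - 90 = 10.0
# while for t = (10.0, 100.0), the depletion branch gives
#   val = 10*(-log10(10) + log10(100)) - 100 + 10 = 10 - 90 = -80.0,
# carrying the promised minus sign.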
# ------------------------------------
# constants
# ------------------------------------
__version__ = "CallPeakUnit $Revision$"
__author__ = "Tao Liu <vladimir.liu@gmail.com>"
__doc__ = "CallPeakUnit"

LOG10_E = 0.43429448190325176

# ------------------------------------
# Misc functions
# ------------------------------------
cdef void clean_up_ndarray ( np.ndarray x ):
    # clean numpy ndarray in two steps
    cdef:
        int64_t i
    i = x.shape[0] // 2
    x.resize( 100000 if i > 100000 else i, refcheck=False)
    x.resize( 0, refcheck=False)
    return
cdef inline float32_t chi2_k1_cdf ( float32_t x ):
    return erf( sqrt(x/2) )

cdef inline float32_t log10_chi2_k1_cdf ( float32_t x ):
    return log10( erf( sqrt(x/2) ) )

cdef inline float32_t chi2_k2_cdf ( float32_t x ):
    return 1 - exp( -x/2 )

cdef inline float32_t log10_chi2_k2_cdf ( float32_t x ):
    return log1p( - exp( -x/2 ) ) * LOG10_E

cdef inline float32_t chi2_k4_cdf ( float32_t x ):
    return 1 - exp( -x/2 ) * ( 1 + x/2 )

cdef inline float32_t log10_chi2_k4_CDF ( float32_t x ):
    return log1p( - exp( -x/2 ) * ( 1 + x/2 ) ) * LOG10_E
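# These are closed-form chi-square CDFs (and their log10 variants) for
# k = 1, 2, 4 degrees of freedom. For example:
#   chi2_k2_cdf(2.0) = 1 - exp(-1) ~= 0.63212
#   log10_chi2_k2_cdf(2.0) = log1p(-exp(-1)) * LOG10_E ~= -0.1992
# i.e. log10(0.63212), using log1p for numerical stability near 0.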
cdef inline np.ndarray apply_multiple_cutoffs ( list multiple_score_arrays, list multiple_cutoffs ):
    cdef:
        int32_t i
        np.ndarray ret

    ret = multiple_score_arrays[0] > multiple_cutoffs[0]

    for i in range(1,len(multiple_score_arrays)):
        ret += multiple_score_arrays[i] > multiple_cutoffs[i]

    return ret
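# A commented-out numpy sketch of what apply_multiple_cutoffs computes:
# adding boolean arrays is an element-wise OR, so a position is kept if
# it exceeds ANY of the cutoffs; the per-summit double-check against
# EVERY cutoff happens later in the __close_peak_* methods.
#
#   import numpy as np
#   scores = [np.array([1.0, 6.0, 2.0]), np.array([9.0, 0.0, 2.0])]
#   cutoffs = [5.0, 8.0]
#   ret = (scores[0] > cutoffs[0])      # [False, True, False]
#   ret += (scores[1] > cutoffs[1])     # OR -> [True, True, False]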
cdef inline list get_from_multiple_scores ( list multiple_score_arrays, int32_t index ):
    cdef:
        list ret
        int32_t i

    ret = []
    for i in range(len(multiple_score_arrays)):
        ret.append(multiple_score_arrays[i][index])
    return ret
cdef inline float32_t get_logFE ( float32_t x, float32_t y ):
    """ return log10 fold enrichment; the caller is expected to have
    already added the pseudocount to both x and y.
    """
    return log10( x/y )

cdef inline float32_t get_subtraction ( float32_t x, float32_t y):
    """ return subtraction.
    """
    return x - y
cdef inline list getitem_then_subtract ( list peakset, int32_t start ):
    cdef:
        list a
        int32_t i

    a = [x["start"] for x in peakset]
    for i in range(len(a)):
        a[i] = a[i] - start
    return a
cdef inline int32_t left_sum ( data, int32_t pos, int32_t width ):
    return sum([data[x] for x in data if x <= pos and x >= pos - width])

cdef inline int32_t right_sum ( data, int32_t pos, int32_t width ):
    return sum([data[x] for x in data if x >= pos and x <= pos + width])

cdef inline int32_t left_forward ( data, int32_t pos, int32_t window_size ):
    return data.get(pos,0) - data.get(pos-window_size, 0)

cdef inline int32_t right_forward ( data, int32_t pos, int32_t window_size ):
    return data.get(pos + window_size, 0) - data.get(pos, 0)
cdef float32_t median_from_value_length ( np.ndarray[np.float32_t, ndim=1] value, list length ):
    """take list of values and list of corresponding lengths, calculate the median.
    An important function for bedGraph type of data.
    """
    cdef:
        list tmp
        int32_t c, tmp_l
        float32_t tmp_v, mid_l

    c = 0
    tmp = sorted(list(zip( value, length )))
    mid_l = sum( length )/2
    for (tmp_v, tmp_l) in tmp:
        c += tmp_l
        if c > mid_l:
            return tmp_v
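# Example (commented): values [1.0, 3.0, 5.0] with interval lengths
# [10, 10, 100] give mid_l = 60; the cumulative lengths 10, 20, 120
# first exceed 60 at the last entry, so the weighted median is 5.0.
# Each value is weighted by the length of the bedGraph interval it
# came from, which is why this suits run-length-encoded pileups.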
cdef float32_t mean_from_value_length ( np.ndarray[np.float32_t, ndim=1] value, list length ):
    """take list of values and list of corresponding lengths, calculate the mean.
    An important function for bedGraph type of data.
    """
    cdef:
        int32_t i
        int32_t tmp_l, l
        float64_t tmp_v, sum_v, tmp_sum   # try to solve precision issue
        float32_t ret

    sum_v = 0
    l = 0

    for i in range( len(length) ):
        tmp_l = length[ i ]
        tmp_v = <float64_t>value[ i ]
        tmp_sum = tmp_v * tmp_l
        sum_v = tmp_sum + sum_v
        l += tmp_l

    ret = <float32_t>(sum_v/l)

    return ret
cdef tuple find_optimal_cutoff( list x, list y ):
    """Return the best cutoff x and y.

    We assume that total peak length increases exponentially while
    decreasing the cutoff value. But when the cutoff decreases to a
    point where background noises are captured, total length increases
    much faster. So we fit a linear model using the first 10 points,
    then look for the largest cutoff at which the observed total
    length starts to deviate from the fitted line.
    """
    cdef:
        np.ndarray npx, npy, npA
        float32_t optimal_x, optimal_y
        int64_t l, i
        float32_t m, c # slope and intercept
        float32_t sst # sum of squared total
        float32_t sse # sum of squared error
        float32_t rsq # R-squared

    l = len(x)
    assert l == len(y)
    npx = np.array( x )
    npy = np.log10( np.array( y ) )
    npA = np.vstack( [npx, np.ones(len(npx))] ).T

    for i in range( 10, l ):
        # at least the largest 10 points
        m, c = np.linalg.lstsq( npA[:i], npy[:i], rcond=None )[ 0 ]
        sst = sum( ( npy[:i] - np.mean( npy[:i] ) ) ** 2 )
        sse = sum( ( npy[:i] - m*npx[:i] - c ) ** 2 )
        rsq = 1 - sse/sst
        #print i, x[i], y[i], m, c, rsq
    # NOTE: selecting the elbow point from the fitted models is not
    # present in this version (the only caller below is commented out),
    # so a placeholder pair is returned.
    return ( 0.0, 0.0 )
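# A commented-out toy run of the linear fit used above (np.linalg.lstsq
# with an [x, 1] design matrix returns slope m and intercept c):
#
#   npx = np.array([10., 9., 8., 7., 6.])
#   npy = np.log10(np.array([200., 400., 800., 1600., 3200.]))
#   npA = np.vstack([npx, np.ones(len(npx))]).T
#   m, c = np.linalg.lstsq(npA, npy, rcond=None)[0]
#   # m ~= -0.301: the total length doubles for every 1.0 drop in
#   # the -log10 p-value cutoff, matching the exponential assumption.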
# ------------------------------------
# Classes
# ------------------------------------
cdef class CallerFromAlignments:
    """A unit to calculate scores and call peaks from alignments --
    FWTrack or PETrack objects.

    It will compute for each chromosome separately in order to save
    memory usage.
    """
276 It will compute for each chromosome separately in order to save
280 object treat
# FWTrack or PETrackI object for ChIP
281 object ctrl
# FWTrack or PETrackI object for Control
283 int32_t d
# extension size for ChIP
284 list ctrl_d_s
# extension sizes for Control. Can be multiple values
285 float32_t treat_scaling_factor
# scaling factor for ChIP
286 list ctrl_scaling_factor_s
# scaling factor for Control, corresponding to each extension size.
287 float32_t lambda_bg
# minimum local bias to fill missing values
288 list chromosomes
# name of common chromosomes in ChIP and Control data
289 float64_t pseudocount
# the pseudocount used to calcuate logLR, FE or logFE
290 bytes bedGraph_filename_prefix
# prefix will be added to _pileup.bdg for treatment and _lambda.bdg for control
292 int32_t end_shift
# shift of cutting ends before extension
293 bool trackline
# whether trackline should be saved in bedGraph
294 bool save_bedGraph
# whether to save pileup and local bias in bedGraph files
295 bool save_SPMR
# whether to save pileup normalized by sequencing depth in million reads
296 bool no_lambda_flag
# whether ignore local bias, and to use global bias instead
297 bool PE_mode
# whether it's in PE mode, will be detected during initiation
299 # temporary data buffer
300 list chr_pos_treat_ctrl
# temporary [position, treat_pileup, ctrl_pileup] for a given chromosome
301 bytes bedGraph_treat_filename
302 bytes bedGraph_control_filename
303 FILE
* bedGraph_treat_f
304 FILE
* bedGraph_ctrl_f
306 # data needed to be pre-computed before peak calling
307 object pqtable
# remember pvalue->qvalue convertion; saved in cykhash Float32to32Map
308 bool pvalue_all_done
# whether the pvalue of whole genome is all calculated. If yes, it's OK to calculate q-value.
310 dict pvalue_npeaks
# record for each pvalue cutoff, how many peaks can be called
311 dict pvalue_length
# record for each pvalue cutoff, the total length of called peaks
312 float32_t optimal_p_cutoff
# automatically decide the p-value cutoff ( can be translated into qvalue cutoff ) based
313 # on p-value to total peak length analysis.
314 bytes cutoff_analysis_filename
# file to save the pvalue-npeaks-totallength table
316 dict pileup_data_files
# Record the names of temporary files for storing pileup values of each chromosome
    def __init__ (self, treat, ctrl,
                  int32_t d = 200, list ctrl_d_s = [200, 1000, 10000],
                  float32_t treat_scaling_factor = 1.0, list ctrl_scaling_factor_s = [1.0, 0.2, 0.02],
                  bool stderr_on = False,
                  float32_t pseudocount = 1,
                  int32_t end_shift = 0,
                  float32_t lambda_bg = 0,
                  bool save_bedGraph = False,
                  str bedGraph_filename_prefix = "PREFIX",
                  str bedGraph_treat_filename = "TREAT.bdg",
                  str bedGraph_control_filename = "CTRL.bdg",
                  str cutoff_analysis_filename = "TMP.txt",
                  bool save_SPMR = False ):
        """Initialize.

        A calculator is unique to each comparison of treat and
        control. Treat_depth and ctrl_depth should not be changed
        during calculation.

        treat and ctrl are either FWTrack or PETrackI objects.

        treat_depth and ctrl_depth are effective depths in million:
                                   sequencing depth in million after
                                   duplicates being filtered. If
                                   treatment is scaled down to
                                   control sample size, then this
                                   should be control sample size in
                                   million. And vice versa.

        d, sregion, lregion: d is the fragment size, sregion is the
                             small region size, lregion is the large
                             region size.

        pseudocount: a pseudocount used to calculate logLR, FE or
                     logFE. Please note this value will not be changed
                     with normalization method. So if you really want
                     to set pseudocount 1 per million reads, set it
                     after you normalize treat and control by million
                     reads by `change_normalization_method(ord('M'))`.
        """
        cdef:
            set chr1, chr2
            int32_t i
            float32_t p

        if isinstance(treat, FWTrack):
            self.PE_mode = False
        elif isinstance(treat, PETrackI):
            self.PE_mode = True
        else:
            raise Exception("Should be FWTrack or PETrackI object!")
        # decide if there is control
        self.treat = treat
        if ctrl:
            self.ctrl = ctrl
        else:                   # while there is no control
            self.ctrl = treat
        self.trackline = False
        self.d = d              # note, self.d doesn't make sense in PE mode
        self.ctrl_d_s = ctrl_d_s        # note, self.ctrl_d_s doesn't make sense in PE mode
        self.treat_scaling_factor = treat_scaling_factor
        self.ctrl_scaling_factor_s = ctrl_scaling_factor_s
        self.end_shift = end_shift
        self.lambda_bg = lambda_bg
        self.pqtable = Float32to32Map( for_int = False ) # Float32 -> Float32 map
        self.save_bedGraph = save_bedGraph
        self.save_SPMR = save_SPMR
        self.bedGraph_filename_prefix = bedGraph_filename_prefix.encode()
        self.bedGraph_treat_filename = bedGraph_treat_filename.encode()
        self.bedGraph_control_filename = bedGraph_control_filename.encode()
        if not self.ctrl_d_s or not self.ctrl_scaling_factor_s:
            self.no_lambda_flag = True
        else:
            self.no_lambda_flag = False
        self.pseudocount = pseudocount
        # get the common chromosome names from both treatment and control
        chr1 = set(self.treat.get_chr_names())
        chr2 = set(self.ctrl.get_chr_names())
        self.chromosomes = sorted(list(chr1.intersection(chr2)))

        self.pileup_data_files = {}
        self.pvalue_length = {}
        self.pvalue_npeaks = {}
        # step for the optimal-cutoff scan is 0.3 in -log10(pvalue); we try
        # from pvalue 1e-10 (-log10 p = 10) down to ~0.5 (-log10 p = 0.3)
        for p in np.arange( 0.3, 10, 0.3 ):
            self.pvalue_length[ p ] = 0
            self.pvalue_npeaks[ p ] = 0
        self.optimal_p_cutoff = 0
        self.cutoff_analysis_filename = cutoff_analysis_filename.encode()
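    # A hypothetical construction sketch (commented out): `treat_track`
    # and `ctrl_track` would be FWTrack objects built elsewhere, e.g. by
    # MACS3's parsers; the parameter values here are illustrative only.
    #
    #   caller = CallerFromAlignments( treat_track, ctrl_track,
    #                                  d = 147,
    #                                  ctrl_d_s = [147, 1000, 10000],
    #                                  treat_scaling_factor = 1.0,
    #                                  ctrl_scaling_factor_s = [1.0, 0.147, 0.0147],
    #                                  lambda_bg = 0.8,
    #                                  save_bedGraph = False )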
    cpdef destroy ( self ):
        """Remove temporary files for pileup values of each chromosome.

        Note: This function MUST be called if the class object won't
        be used anymore.

        """
        cdef:
            object f

        for f in self.pileup_data_files.values():
            if os.path.isfile( f ):
                os.unlink( f )
        return
    cpdef set_pseudocount( self, float32_t pseudocount ):
        self.pseudocount = pseudocount

    cpdef enable_trackline( self ):
        """Turn on trackline with bedgraph output
        """
        self.trackline = True
    cdef __pileup_treat_ctrl_a_chromosome ( self, bytes chrom ):
        """After this function is called, self.chr_pos_treat_ctrl will
        be reset and assigned to the pileup values of the given
        chromosome.

        """
        cdef:
            list treat_pv, ctrl_pv
            int64_t i
            object f

        assert chrom in self.chromosomes, "chromosome %s is not valid." % chrom

        # check backup file of pileup values. If not exists, create
        # it. Otherwise, load them instead of calculating new pileup
        # values.
        if chrom in self.pileup_data_files:
            try:
                f = open( self.pileup_data_files[ chrom ],"rb" )
                self.chr_pos_treat_ctrl = cPickle.load( f )
                f.close()
                return
            except:
                temp_fd, temp_filename = mkstemp()
                os.close(temp_fd)
                self.pileup_data_files[ chrom ] = temp_filename
        else:
            temp_fd, temp_filename = mkstemp()
            os.close(temp_fd)
            self.pileup_data_files[ chrom ] = temp_filename.encode()

        # reset or clean existing self.chr_pos_treat_ctrl
        if self.chr_pos_treat_ctrl:     # not a beautiful way to clean
            clean_up_ndarray( self.chr_pos_treat_ctrl[0] )
            clean_up_ndarray( self.chr_pos_treat_ctrl[1] )
            clean_up_ndarray( self.chr_pos_treat_ctrl[2] )

        if self.PE_mode:
            treat_pv = self.treat.pileup_a_chromosome ( chrom, [self.treat_scaling_factor,], baseline_value = 0.0 )
        else:
            treat_pv = self.treat.pileup_a_chromosome( chrom, [self.d,], [self.treat_scaling_factor,], baseline_value = 0.0,
                                                       directional = True,
                                                       end_shift = self.end_shift )

        if not self.no_lambda_flag:
            if self.PE_mode:
                # note, we pileup up PE control as SE control because
                # we assume the bias only can be captured at the
                # surrounding regions of cutting sites from control experiments.
                ctrl_pv = self.ctrl.pileup_a_chromosome_c( chrom, self.ctrl_d_s, self.ctrl_scaling_factor_s, baseline_value = self.lambda_bg )
            else:
                ctrl_pv = self.ctrl.pileup_a_chromosome( chrom, self.ctrl_d_s, self.ctrl_scaling_factor_s,
                                                         baseline_value = self.lambda_bg,
                                                         directional = False )
        else:
            ctrl_pv = [treat_pv[0][-1:], np.array([self.lambda_bg,], dtype="float32")] # set a global lambda

        self.chr_pos_treat_ctrl = self.__chrom_pair_treat_ctrl( treat_pv, ctrl_pv)

        # clean treat_pv and ctrl_pv
        treat_pv = []
        ctrl_pv = []

        # save data to temporary file
        try:
            f = open(self.pileup_data_files[ chrom ],"wb")
            cPickle.dump( self.chr_pos_treat_ctrl, f , protocol=2 )
            f.close()
        except:
            # fail to write then remove the key in pileup_data_files
            self.pileup_data_files.pop(chrom)
        return
    cdef list __chrom_pair_treat_ctrl ( self, treat_pv, ctrl_pv ):
        """*private* Pair treat and ctrl pileup for each region.

        treat_pv and ctrl_pv are [np.ndarray, np.ndarray].

        return [p, t, c] list, each element is a numpy array.
        """
        cdef:
            int64_t pre_p, index_ret, it, ic, lt, lc
            int64_t chrom_max_len
            np.ndarray[np.int32_t, ndim=1] t_p, c_p, ret_p
            np.ndarray[np.float32_t, ndim=1] t_v, c_v, ret_t, ret_c

            int32_t * t_p_ptr
            int32_t * c_p_ptr
            int32_t * ret_p_ptr
            float32_t * t_v_ptr
            float32_t * c_v_ptr
            float32_t * ret_t_ptr
            float32_t * ret_c_ptr

        [ t_p, t_v ] = treat_pv
        [ c_p, c_v ] = ctrl_pv

        lt = t_p.shape[0]
        lc = c_p.shape[0]

        chrom_max_len = lt + lc

        ret_p = np.zeros( chrom_max_len, dtype="int32" )   # position
        ret_t = np.zeros( chrom_max_len, dtype="float32" ) # value from treatment
        ret_c = np.zeros( chrom_max_len, dtype="float32" ) # value from control

        t_p_ptr = <int32_t *> t_p.data
        t_v_ptr = <float32_t *> t_v.data
        c_p_ptr = <int32_t *> c_p.data
        c_v_ptr = <float32_t *> c_v.data
        ret_p_ptr = <int32_t *> ret_p.data
        ret_t_ptr = <float32_t *>ret_t.data
        ret_c_ptr = <float32_t *>ret_c.data

        pre_p = 0
        index_ret = 0
        it = 0
        ic = 0

        while it < lt and ic < lc:
            if t_p_ptr[0] < c_p_ptr[0]:
                # clip a region from pre_p to p1, then set pre_p as p1.
                ret_p_ptr[0] = t_p_ptr[0]
                ret_t_ptr[0] = t_v_ptr[0]
                ret_c_ptr[0] = c_v_ptr[0]
                ret_p_ptr += 1
                ret_t_ptr += 1
                ret_c_ptr += 1
                pre_p = t_p_ptr[0]
                index_ret += 1
                # call for the next p1 and v1
                it += 1
                t_p_ptr += 1
                t_v_ptr += 1
            elif t_p_ptr[0] > c_p_ptr[0]:
                # clip a region from pre_p to p2, then set pre_p as p2.
                ret_p_ptr[0] = c_p_ptr[0]
                ret_t_ptr[0] = t_v_ptr[0]
                ret_c_ptr[0] = c_v_ptr[0]
                ret_p_ptr += 1
                ret_t_ptr += 1
                ret_c_ptr += 1
                pre_p = c_p_ptr[0]
                index_ret += 1
                # call for the next p2 and v2
                ic += 1
                c_p_ptr += 1
                c_v_ptr += 1
            else:
                # from pre_p to p1 or p2, then set pre_p as p1 or p2.
                ret_p_ptr[0] = t_p_ptr[0]
                ret_t_ptr[0] = t_v_ptr[0]
                ret_c_ptr[0] = c_v_ptr[0]
                ret_p_ptr += 1
                ret_t_ptr += 1
                ret_c_ptr += 1
                pre_p = t_p_ptr[0]
                index_ret += 1
                # call for the next p1, v1, p2, v2.
                it += 1
                ic += 1
                t_p_ptr += 1
                t_v_ptr += 1
                c_p_ptr += 1
                c_v_ptr += 1

        ret_p.resize( index_ret, refcheck=False)
        ret_t.resize( index_ret, refcheck=False)
        ret_c.resize( index_ret, refcheck=False)
        return [ret_p, ret_t, ret_c]
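    # Example (commented) of the pairing above. Pileups are run-length
    # encoded as [end_positions, values]; given
    #   treat: positions [10, 20], values [1.0, 2.0]
    #   ctrl : positions [15, 20], values [3.0, 4.0]
    # the merged result is
    #   positions [10, 15, 20], treat [1.0, 2.0, 2.0], ctrl [3.0, 3.0, 4.0]
    # i.e. one breakpoint wherever either track changes value, so the
    # score at every basepair can be computed from aligned intervals.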
    cdef np.ndarray __cal_score ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2, cal_func ):
        cdef:
            int64_t i
            np.ndarray[np.float32_t, ndim=1] s
        assert array1.shape[0] == array2.shape[0]
        s = np.zeros(array1.shape[0], dtype="float32")
        for i in range(array1.shape[0]):
            s[i] = cal_func( array1[i], array2[i] )
        return s
    cdef void __cal_pvalue_qvalue_table ( self ):
        """After this function is called, self.pqtable is built. All
        chromosomes will be iterated. So it will take some time.

        """
        cdef:
            bytes chrom
            np.ndarray pos_array, treat_array, ctrl_array, score_array
            dict pscore_stat
            int64_t n, pre_p, length, pre_l, l, i, j
            float32_t this_v, pre_v, v, q, pre_q
            int64_t N, k, this_l
            float32_t f
            list unique_values
            int32_t * pos_ptr
            float32_t * treat_value_ptr
            float32_t * ctrl_value_ptr

        debug ( "Start to calculate pvalue stat..." )

        pscore_stat = {} #dict()
        for i in range( len( self.chromosomes ) ):
            chrom = self.chromosomes[ i ]
            pre_p = 0

            self.__pileup_treat_ctrl_a_chromosome( chrom )
            [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl

            pos_ptr = <int32_t *> pos_array.data
            treat_value_ptr = <float32_t *> treat_array.data
            ctrl_value_ptr = <float32_t *> ctrl_array.data

            for j in range(pos_array.shape[0]):
                this_v = get_pscore( (<int32_t>(treat_value_ptr[0]), ctrl_value_ptr[0] ) )
                this_l = pos_ptr[0] - pre_p
                if this_v in pscore_stat:
                    pscore_stat[ this_v ] += this_l
                else:
                    pscore_stat[ this_v ] = this_l
                pre_p = pos_ptr[0]
                pos_ptr += 1
                treat_value_ptr += 1
                ctrl_value_ptr += 1

        N = sum(pscore_stat.values()) # total length
        k = 1                         # rank
        f = -log10(N)
        pre_v = -2147483647
        pre_l = 0
        pre_q = 2147483647            # save the previous q-value

        self.pqtable = Float32to32Map( for_int = False )
        unique_values = sorted(list(pscore_stat.keys()), reverse=True)
        for i in range(len(unique_values)):
            v = unique_values[i]
            l = pscore_stat[v]
            q = v + (log10(k) + f)
            if q > pre_q:
                q = pre_q
            if q <= 0:
                q = 0
                break
            #q = max(0,min(pre_q,q)) # make q-score monotonic
            self.pqtable[ v ] = q
            pre_v = v
            pre_q = q
            k += l
        # bottom rank pscores all have qscores 0
        for j in range(i, len(unique_values) ):
            v = unique_values[ j ]
            self.pqtable[ v ] = 0
        return
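    # Worked example of the conversion above: in -log10 space the
    # Benjamini-Hochberg q-value of a p-score v with rank k (ranked by
    # total basepairs covered, out of N total basepairs) is
    #   q = v + (log10(k) + f) = v + log10(k) - log10(N) = v + log10(k/N)
    # e.g. v = 6.0, k = 1000, N = 1e8 gives q = 6 + 3 - 8 = 1.0; the
    # clamp against pre_q keeps the table monotonically non-increasing.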
    cdef void __pre_computes ( self, int32_t max_gap = 50, int32_t min_length = 200 ):
        """After this function is called, self.pqtable and self.pvalue_length is built. All
        chromosomes will be iterated. So it will take some time.

        """
        cdef:
            bytes chrom
            np.ndarray pos_array, treat_array, ctrl_array, score_array
            dict pscore_stat
            int64_t n, pre_p, this_p, length, j, pre_l, l, i
            float32_t q, pre_q, this_t, this_c
            float32_t this_v, pre_v, v, cutoff
            int64_t N, k, this_l
            float32_t f
            list unique_values
            list tmplist

            np.ndarray above_cutoff, above_cutoff_endpos, above_cutoff_startpos
            list peak_content
            int64_t peak_length, total_l, total_p
            int64_t lastp, tl

            int32_t * acs_ptr           # above cutoff start position pointer
            int32_t * ace_ptr           # above cutoff end position pointer
            int32_t * pos_array_ptr    # position array pointer
            float32_t * score_array_ptr # score array pointer

        debug ( "Start to calculate pvalue stat..." )

        # tmplist contains a list of log pvalue cutoffs from 0.3 to 10
        tmplist = [round(x,5) for x in sorted( list(np.arange(0.3, 10.0, 0.3)), reverse = True )]

        pscore_stat = {} #dict()
        #print (list(pscore_stat.keys()))
        #print (list(self.pvalue_length.keys()))
        #print (list(self.pvalue_npeaks.keys()))
        for i in range( len( self.chromosomes ) ):
            chrom = self.chromosomes[ i ]
            self.__pileup_treat_ctrl_a_chromosome( chrom )
            [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl

            score_array = self.__cal_pscore( treat_array, ctrl_array )

            for n in range( len( tmplist ) ):
                cutoff = tmplist[ n ]
                total_l = 0     # total length in potential peak
                total_p = 0

                # get the regions with scores above cutoffs
                above_cutoff = np.nonzero( score_array > cutoff )[0]# this is not an optimized method. It would be better to store score array in a 2-D ndarray?
                above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff
                above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff

                if above_cutoff_endpos.size == 0:
                    continue

                # first bit of region above cutoff
                acs_ptr = <int32_t *> above_cutoff_startpos.data
                ace_ptr = <int32_t *> above_cutoff_endpos.data

                peak_content = [( acs_ptr[ 0 ], ace_ptr[ 0 ] ), ]
                lastp = ace_ptr[ 0 ]
                acs_ptr += 1
                ace_ptr += 1

                for i in range( 1, above_cutoff_startpos.size ):
                    tl = acs_ptr[ 0 ] - lastp
                    if tl <= max_gap:
                        peak_content.append( ( acs_ptr[ 0 ], ace_ptr[ 0 ] ) )
                    else:
                        peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
                        if peak_length >= min_length: # if the peak is too small, reject it
                            total_l += peak_length
                            total_p += 1
                        peak_content = [ ( acs_ptr[ 0 ], ace_ptr[ 0 ] ), ]
                    lastp = ace_ptr[ 0 ]
                    acs_ptr += 1
                    ace_ptr += 1

                if peak_content:
                    peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
                    if peak_length >= min_length: # if the peak is too small, reject it
                        total_l += peak_length
                        total_p += 1
                self.pvalue_length[ cutoff ] = self.pvalue_length.get( cutoff, 0 ) + total_l
                self.pvalue_npeaks[ cutoff ] = self.pvalue_npeaks.get( cutoff, 0 ) + total_p

            pos_array_ptr = <int32_t *> pos_array.data
            score_array_ptr = <float32_t *> score_array.data

            pre_p = 0
            for i in range(pos_array.shape[0]):
                this_p = pos_array_ptr[ 0 ]
                this_l = this_p - pre_p
                this_v = score_array_ptr[ 0 ]
                if this_v in pscore_stat:
                    pscore_stat[ this_v ] += this_l
                else:
                    pscore_stat[ this_v ] = this_l
                pre_p = this_p #pos_array[ i ]
                pos_array_ptr += 1
                score_array_ptr += 1

        #debug ( "make pscore_stat cost %.5f seconds" % t )

        # add all pvalue cutoffs from cutoff-analysis part. So that we
        # can get the corresponding qvalues for them.
        for cutoff in tmplist:
            if cutoff not in pscore_stat:
                pscore_stat[ cutoff ] = 0

        N = sum(pscore_stat.values()) # total length
        k = 1                         # rank
        f = -log10(N)
        pre_v = -2147483647
        pre_l = 0
        pre_q = 2147483647            # save the previous q-value

        self.pqtable = Float32to32Map( for_int = False ) #{}
        unique_values = sorted(list(pscore_stat.keys()), reverse=True) #sorted(unique_values,reverse=True)
        for i in range(len(unique_values)):
            v = unique_values[i]
            l = pscore_stat[v]
            q = v + (log10(k) + f)
            if q > pre_q:
                q = pre_q
            if q <= 0:
                q = 0
                break
            #q = max(0,min(pre_q,q)) # make q-score monotonic
            self.pqtable[ v ] = q
            pre_v = v
            pre_q = q
            k += l

        for j in range(i, len(unique_values) ):
            v = unique_values[ j ]
            self.pqtable[ v ] = 0

        # write pvalue and total length of predicted peaks
        # this is the output from cutoff-analysis
        fhd = open( self.cutoff_analysis_filename, "w" )
        fhd.write( "pscore\tqscore\tnpeaks\tlpeaks\tavelpeak\n" )
        x = []
        y = []
        for cutoff in tmplist:
            if self.pvalue_npeaks[ cutoff ] > 0:
                fhd.write( "%.2f\t%.2f\t%d\t%d\t%.2f\n" % ( cutoff, self.pqtable[ cutoff ], self.pvalue_npeaks[ cutoff ], self.pvalue_length[ cutoff ], self.pvalue_length[ cutoff ]/self.pvalue_npeaks[ cutoff ] ) )
                x.append( cutoff )
                y.append( self.pvalue_length[ cutoff ] )
        fhd.close()
        info( "#3 Analysis of cutoff vs num of peaks or total length has been saved in %s" % self.cutoff_analysis_filename )
        #info( "#3 Suggest a cutoff..." )
        #optimal_cutoff, optimal_length = find_optimal_cutoff( x, y )
        #info( "#3 -10log10pvalue cutoff %.2f will call approximately %.0f bps regions as significant regions" % ( optimal_cutoff, optimal_length ) )
        #print (list(pqtable.keys()))
        #print (list(self.pvalue_length.keys()))
        #print (list(self.pvalue_npeaks.keys()))
        return
    cpdef call_peaks ( self, list scoring_function_symbols, list score_cutoff_s, int32_t min_length = 200,
                       int32_t max_gap = 50, bool call_summits = False, bool cutoff_analysis = False ):
        """Call peaks for all chromosomes. Return a PeakIO object.

        scoring_function_symbols: symbols of functions to calculate score. 'p' for pscore, 'q' for qscore, 'f' for fold change, 's' for subtraction. for example: ['p', 'q']
        score_cutoff_s    : cutoff values corresponding to scoring functions
        min_length        : minimum length of peak
        max_gap           : maximum gap of 'insignificant' regions within a peak. Note, for PE_mode, max_gap and max_length are both set as fragment length.
        call_summits      : boolean. Whether or not call sub-peaks.
        save_bedGraph     : whether or not to save pileup and control into a bedGraph file
        """
        cdef:
            bytes chrom
            bytes tmp_bytes

        peaks = PeakIO()

        # prepare p-q table
        if len( self.pqtable ) == 0:
            info("#3 Pre-compute pvalue-qvalue table...")
            if cutoff_analysis:
                info("#3 Cutoff vs peaks called will be analyzed!")
                self.__pre_computes( max_gap = max_gap, min_length = min_length )
            else:
                self.__cal_pvalue_qvalue_table()

        # prepare bedGraph file
        if self.save_bedGraph:
            self.bedGraph_treat_f = fopen( self.bedGraph_treat_filename, "w" )
            self.bedGraph_ctrl_f = fopen( self.bedGraph_control_filename, "w" )

            info ("#3 In the peak calling step, the following will be performed simultaneously:")
            info ("#3   Write bedGraph files for treatment pileup (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_treat_pileup.bdg")
            info ("#3   Write bedGraph files for control lambda (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_control_lambda.bdg")

            if self.save_SPMR:
                info ( "#3   --SPMR is requested, so pileup will be normalized by sequencing depth in million reads." )
            elif self.treat_scaling_factor == 1:
                info ( "#3   Pileup will be based on sequencing depth in treatment." )
            else:
                info ( "#3   Pileup will be based on sequencing depth in control." )

            if self.trackline:
                # this line is REQUIRED by the wiggle format for UCSC browser
                tmp_bytes = ("track type=bedGraph name=\"treatment pileup\" description=\"treatment pileup after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode()
                fprintf( self.bedGraph_treat_f, tmp_bytes )
                tmp_bytes = ("track type=bedGraph name=\"control lambda\" description=\"control lambda after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode()
                fprintf( self.bedGraph_ctrl_f, tmp_bytes )

        info("#3 Call peaks for each chromosome...")
        for chrom in self.chromosomes:
            # treat/control bedGraph will be saved if requested by user.
            self.__chrom_call_peak_using_certain_criteria ( peaks, chrom, scoring_function_symbols, score_cutoff_s, min_length, max_gap, call_summits, self.save_bedGraph )

        # close bedGraph file
        if self.save_bedGraph:
            fclose(self.bedGraph_treat_f)
            fclose(self.bedGraph_ctrl_f)
            self.save_bedGraph = False

        return peaks
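    # Continuing the hypothetical sketch from __init__ (commented out):
    # with a p-score track and cutoff 5.0 (i.e. pvalue 1e-5), roughly
    # what the narrow-peak driver would do; parameter values are
    # illustrative only.
    #
    #   peaks = caller.call_peaks( [ 'p' ], [ 5.0 ],
    #                              min_length = 147,
    #                              max_gap = 50,
    #                              call_summits = False )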
    cdef void __chrom_call_peak_using_certain_criteria ( self, peaks, bytes chrom, list scoring_function_s, list score_cutoff_s, int32_t min_length,
                                                         int32_t max_gap, bool call_summits, bool save_bedGraph ):
        """ Call peaks for a chromosome.

        Combination of criteria is allowed here.

        peaks: a PeakIO object, the return value of this function
        scoring_function_s: symbols of functions to calculate score as score=f(x, y) where x is treatment pileup, and y is control pileup
        save_bedGraph     : whether or not to save pileup and control into a bedGraph file
        """
        cdef:
            int32_t i
            str s
            np.ndarray above_cutoff
            np.ndarray[np.int32_t, ndim=1] above_cutoff_endpos, above_cutoff_startpos, pos_array, above_cutoff_index_array
            np.ndarray[np.float32_t, ndim=1] treat_array, ctrl_array
            list score_array_s          # list to keep different types of scores
            list peak_content           # to store information for a
                                        # chunk in a peak region, it
                                        # contains lists of: 1. left
                                        # position; 2. right
                                        # position; 3. treatment
                                        # value; 4. control value;
                                        # 5. list of scores at this
                                        # chunk
            int64_t tl, lastp, ts, te, ti
            float32_t tp, cp
            int32_t * acs_ptr
            int32_t * ace_ptr
            int32_t * acia_ptr
            float32_t * treat_array_ptr
            float32_t * ctrl_array_ptr

        assert len(scoring_function_s) == len(score_cutoff_s), "number of functions and cutoffs should be the same!"

        peak_content = []           # to store points above cutoff

        # first, build pileup, self.chr_pos_treat_ctrl
        # this step will be sped up if pqtable is pre-computed.
        self.__pileup_treat_ctrl_a_chromosome( chrom )
        [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl

        # while save_bedGraph is true, invoke __write_bedGraph_for_a_chromosome
        if save_bedGraph:
            self.__write_bedGraph_for_a_chromosome ( chrom )

        # keep all types of scores needed
        score_array_s = []
        for i in range(len(scoring_function_s)):
            s = scoring_function_s[i]
            if s == 'p':
                score_array_s.append( self.__cal_pscore( treat_array, ctrl_array ) )
            elif s == 'q':
                score_array_s.append( self.__cal_qscore( treat_array, ctrl_array ) )
            elif s == 'f':
                score_array_s.append( self.__cal_FE( treat_array, ctrl_array ) )
            elif s == 's':
                score_array_s.append( self.__cal_subtraction( treat_array, ctrl_array ) )

        # get the regions with scores above cutoffs
        above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,score_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray?
        above_cutoff_index_array = np.arange(pos_array.shape[0],dtype="int32")[above_cutoff] # indices
        above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff
        above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff

        if above_cutoff.size == 0:
            # nothing above cutoff
            return

        if above_cutoff[0] == 0:
            # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
            above_cutoff_startpos[0] = 0

        #print "apply cutoff -- chrom:",chrom,"  time:", ttime() - t0
        # start to build peak regions

        # first bit of region above cutoff
        acs_ptr = <int32_t *>above_cutoff_startpos.data
        ace_ptr = <int32_t *>above_cutoff_endpos.data
        acia_ptr = <int32_t *>above_cutoff_index_array.data
        treat_array_ptr = <float32_t *> treat_array.data
        ctrl_array_ptr = <float32_t *> ctrl_array.data

        ts = acs_ptr[ 0 ]
        te = ace_ptr[ 0 ]
        ti = acia_ptr[ 0 ]
        tp = treat_array_ptr[ ti ]
        cp = ctrl_array_ptr[ ti ]

        peak_content.append( ( ts, te, tp, cp, ti ) )
        lastp = te
        acs_ptr += 1
        ace_ptr += 1
        acia_ptr += 1

        for i in range( 1, above_cutoff_startpos.shape[0] ):
            ts = acs_ptr[ 0 ]
            te = ace_ptr[ 0 ]
            ti = acia_ptr[ 0 ]
            acs_ptr += 1
            ace_ptr += 1
            acia_ptr += 1
            tp = treat_array_ptr[ ti ]
            cp = ctrl_array_ptr[ ti ]
            tl = ts - lastp
            if tl <= max_gap:
                # append
                peak_content.append( ( ts, te, tp, cp, ti ) )
                lastp = te #above_cutoff_endpos[i]
            else:
                # close
                if call_summits:
                    self.__close_peak_with_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'
                else:
                    self.__close_peak_wo_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'
                peak_content = [ ( ts, te, tp, cp, ti ), ]
                lastp = te #above_cutoff_endpos[i]
        # save the last peak
        if not peak_content:
            return
        else:
            if call_summits:
                self.__close_peak_with_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'
            else:
                self.__close_peak_wo_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd'

        #print "close peaks -- chrom:",chrom,"  time:", ttime() - t0
        return
    cdef bool __close_peak_wo_subpeaks (self, list peak_content, peaks, int32_t min_length,
                                        bytes chrom, int32_t smoothlen, list score_array_s, list score_cutoff_s=[]):
        """Close the peak region, output peak boundaries, peak summit
        and scores, then add the peak to peakIO object.

        peak_content contains [start, end, treat_p, ctrl_p, index_in_score_array]

        peaks: a PeakIO object

        """
        cdef:
            int32_t summit_pos, tstart, tend, tmpindex, summit_index, i, midindex
            float64_t treat_v, ctrl_v, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score
            int32_t tlist_scores_p
            list tsummit, tsummit_index
            float64_t summit_value
            int64_t peak_length

        peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
        if peak_length >= min_length: # if the peak is too small, reject it
            tsummit = []
            tsummit_index = []
            summit_value = 0
            for i in range(len(peak_content)):
                (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i]
                tscore = ttreat_p # use treatment pileup as general score to find summit
                if not summit_value or summit_value < tscore:
                    tsummit = [(tend + tstart) // 2, ]
                    tsummit_index = [ i, ]
                    summit_value = tscore
                elif summit_value == tscore:
                    # remember continuous summit values
                    tsummit.append((tend + tstart) // 2)
                    tsummit_index.append( i )
            # the middle of all highest points in peak region is defined as summit
            midindex = (len(tsummit) + 1) // 2 - 1
            summit_pos = tsummit[ midindex ]
            summit_index = tsummit_index[ midindex ]

            summit_treat = peak_content[ summit_index ][ 2 ]
            summit_ctrl = peak_content[ summit_index ][ 3 ]

            # this is a double-check to see if the summit can pass cutoff values.
            for i in range(len(score_cutoff_s)):
                if score_cutoff_s[i] > score_array_s[ i ][ peak_content[ summit_index ][ 4 ] ]:
                    return False # not passed, then discard this peak.

            summit_p_score = pscore_dict[ ( <int32_t>(summit_treat), summit_ctrl ) ] #get_pscore(( <int32_t>(summit_treat), summit_ctrl ) )
            summit_q_score = self.pqtable[ summit_p_score ]

            peaks.add( chrom,              # chromosome
                       peak_content[0][0], # start
                       peak_content[-1][1],# end
                       summit      = summit_pos,    # summit position
                       peak_score  = summit_q_score,# score at summit
                       pileup      = summit_treat,  # pileup
                       pscore      = summit_p_score,# pvalue
                       fold_change = ( summit_treat + self.pseudocount ) / ( summit_ctrl + self.pseudocount ), # fold change
                       qscore      = summit_q_score # qvalue
                       )
            # start a new peak
            return True
    cdef bool __close_peak_with_subpeaks (self, list peak_content, peaks, int32_t min_length,
                                          bytes chrom, int32_t smoothlen, list score_array_s, list score_cutoff_s=[],
                                          float32_t min_valley = 0.9 ):
        """Algorithm implemented by Ben, to profile the pileup signals
        within a peak region then find subpeak summits. This method is
        highly recommended for TFBS or DNAase I sites.

        """
        cdef:
            int32_t summit_pos, tstart, tend, tmpindex, summit_index, summit_offset
            int32_t start, end, i, j, start_boundary, m, n, l
            float64_t summit_value, tvalue, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score
            np.ndarray[np.float32_t, ndim=1] peakdata
            np.ndarray[np.int32_t, ndim=1] peakindices, summit_offsets
            int32_t tlist_scores_p
            int64_t peak_length
            np.ndarray summit_indices

        peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]

        if peak_length < min_length: return  # if the region is too small, reject it

        # Add 10 bp padding to peak region so that we can get true minima
        end = peak_content[ -1 ][ 1 ] + 10
        start = peak_content[ 0 ][ 0 ] - 10
        if start < 0:
            start_boundary = 10 + start # this is the offset of original peak boundary in peakdata list.
            start = 0
        else:
            start_boundary = 10 # this is the offset of original peak boundary in peakdata list.

        peakdata = np.zeros(end - start, dtype='float32') # save the scores (qscore) for each position in this region
        peakindices = np.zeros(end - start, dtype='int32') # save the indices for each position in this region
        for i in range(len(peak_content)):
            (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i]
            tscore = ttreat_p # use pileup as general score to find summit
            m = tstart - start + start_boundary
            n = tend - start + start_boundary
            peakdata[m:n] = tscore
            peakindices[m:n] = i

        summit_offsets = maxima(peakdata, smoothlen) # offsets are the indices for summits in peakdata/peakindices array.

        if summit_offsets.shape[0] == 0:
            # **failsafe** if no summits, fall back on old approach #
            return self.__close_peak_wo_subpeaks(peak_content, peaks, min_length, chrom, smoothlen, score_array_s, score_cutoff_s)
        else:
            # remove maxima that occurred in padding
            m = np.searchsorted(summit_offsets, start_boundary)
            n = np.searchsorted(summit_offsets, peak_length + start_boundary, 'right')
            summit_offsets = summit_offsets[m:n]

        summit_offsets = enforce_peakyness(peakdata, summit_offsets)

        #print "enforced:",summit_offsets
        if summit_offsets.shape[0] == 0:
            # **failsafe** if no summits, fall back on old approach #
            return self.__close_peak_wo_subpeaks(peak_content, peaks, min_length, chrom, smoothlen, score_array_s, score_cutoff_s)

        summit_indices = peakindices[summit_offsets] # indices are those point to peak_content
        summit_offsets -= start_boundary

        for summit_offset, summit_index in list(zip(summit_offsets, summit_indices)):

            summit_treat = peak_content[ summit_index ][ 2 ]
            summit_ctrl = peak_content[ summit_index ][ 3 ]

            summit_p_score = pscore_dict[ ( <int32_t>(summit_treat), summit_ctrl ) ] # get_pscore(( <int32_t>(summit_treat), summit_ctrl ) )
            summit_q_score = self.pqtable[ summit_p_score ]

            for i in range(len(score_cutoff_s)):
                if score_cutoff_s[i] > score_array_s[ i ][ peak_content[ summit_index ][ 4 ] ]:
                    return False # not passed, then discard this summit.

            peaks.add( chrom,
                       peak_content[ 0 ][ 0 ],
                       peak_content[ -1 ][ 1 ],
                       summit      = start + summit_offset,
                       peak_score  = summit_q_score,
                       pileup      = summit_treat,
                       pscore      = summit_p_score,
                       fold_change = (summit_treat + self.pseudocount) / ( summit_ctrl + self.pseudocount ), # fold change
                       qscore      = summit_q_score
                       )
        # start a new peak
        return True
    cdef np.ndarray __cal_pscore ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
        cdef:
            int64_t i, array1_size
            np.ndarray[np.float32_t, ndim=1] s
            float32_t * a1_ptr
            float32_t * a2_ptr
            float32_t * s_ptr

        assert array1.shape[0] == array2.shape[0]
        s = np.zeros(array1.shape[0], dtype="float32")

        a1_ptr = <float32_t *> array1.data
        a2_ptr = <float32_t *> array2.data
        s_ptr = <float32_t *> s.data

        array1_size = array1.shape[0]

        for i in range(array1_size):
            s_ptr[0] = get_pscore(( <int32_t>(a1_ptr[0]), a2_ptr[0] ))
            s_ptr += 1
            a1_ptr += 1
            a2_ptr += 1
        return s
    cdef np.ndarray __cal_qscore ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
        cdef:
            int64_t i, array1_size
            np.ndarray[np.float32_t, ndim=1] s
            float32_t * a1_ptr
            float32_t * a2_ptr
            float32_t * s_ptr

        assert array1.shape[0] == array2.shape[0]
        s = np.zeros(array1.shape[0], dtype="float32")

        a1_ptr = <float32_t *> array1.data
        a2_ptr = <float32_t *> array2.data
        s_ptr = <float32_t *> s.data

        for i in range(array1.shape[0]):
            s_ptr[0] = self.pqtable[ get_pscore(( <int32_t>(a1_ptr[0]), a2_ptr[0] )) ]
            s_ptr += 1
            a1_ptr += 1
            a2_ptr += 1
        return s
    cdef np.ndarray __cal_logLR ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
        cdef:
            int64_t i, array1_size
            np.ndarray[np.float32_t, ndim=1] s
            float32_t * a1_ptr
            float32_t * a2_ptr
            float32_t * s_ptr

        assert array1.shape[0] == array2.shape[0]
        s = np.zeros(array1.shape[0], dtype="float32")

        a1_ptr = <float32_t *> array1.data
        a2_ptr = <float32_t *> array2.data
        s_ptr = <float32_t *> s.data

        for i in range(array1.shape[0]):
            s_ptr[0] = get_logLR_asym( (a1_ptr[0] + self.pseudocount, a2_ptr[0] + self.pseudocount ) )
            s_ptr += 1
            a1_ptr += 1
            a2_ptr += 1
        return s
    cdef np.ndarray __cal_logFE ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
        cdef:
            int64_t i, array1_size
            np.ndarray[np.float32_t, ndim=1] s
            float32_t * a1_ptr
            float32_t * a2_ptr
            float32_t * s_ptr

        assert array1.shape[0] == array2.shape[0]
        s = np.zeros(array1.shape[0], dtype="float32")

        a1_ptr = <float32_t *> array1.data
        a2_ptr = <float32_t *> array2.data
        s_ptr = <float32_t *> s.data

        for i in range(array1.shape[0]):
            s_ptr[0] = get_logFE( a1_ptr[0] + self.pseudocount, a2_ptr[0] + self.pseudocount )
            s_ptr += 1
            a1_ptr += 1
            a2_ptr += 1
        return s
    cdef np.ndarray __cal_FE ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
        cdef:
            int64_t i, array1_size
            np.ndarray[np.float32_t, ndim=1] s
            float32_t * a1_ptr
            float32_t * a2_ptr
            float32_t * s_ptr

        assert array1.shape[0] == array2.shape[0]
        s = np.zeros(array1.shape[0], dtype="float32")

        a1_ptr = <float32_t *> array1.data
        a2_ptr = <float32_t *> array2.data
        s_ptr = <float32_t *> s.data

        for i in range(array1.shape[0]):
            s_ptr[0] = (a1_ptr[0] + self.pseudocount) / ( a2_ptr[0] + self.pseudocount )
            s_ptr += 1
            a1_ptr += 1
            a2_ptr += 1
        return s
    cdef np.ndarray __cal_subtraction ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ):
        cdef:
            int64_t i, array1_size
            np.ndarray[np.float32_t, ndim=1] s
            float32_t * a1_ptr
            float32_t * a2_ptr
            float32_t * s_ptr

        assert array1.shape[0] == array2.shape[0]
        s = np.zeros(array1.shape[0], dtype="float32")

        a1_ptr = <float32_t *> array1.data
        a2_ptr = <float32_t *> array2.data
        s_ptr = <float32_t *> s.data

        for i in range(array1.shape[0]):
            s_ptr[0] = a1_ptr[0] - a2_ptr[0]
            s_ptr += 1
            a1_ptr += 1
            a2_ptr += 1
        return s
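    # All five __cal_* kernels above share the same pointer-walking shape
    # and differ only in the per-position formula; e.g. for fold
    # enrichment, s[i] = (treat[i] + pseudocount) / (ctrl[i] + pseudocount),
    # so with pseudocount = 1, treat = 9 and ctrl = 4 the reported FE is
    # 10 / 5 = 2.0.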
    cdef bool __write_bedGraph_for_a_chromosome ( self, bytes chrom ):
        """Write treat/control values for a certain chromosome into a
        specified file handler.

        """
        cdef:
            np.ndarray[np.int32_t, ndim=1] pos_array
            np.ndarray[np.float32_t, ndim=1] treat_array, ctrl_array
            int32_t * pos_array_ptr
            float32_t * treat_array_ptr
            float32_t * ctrl_array_ptr
            int32_t l, i
            int32_t p, pre_p_t, pre_p_c # current position, previous position for treat, previous position for control
            float32_t pre_v_t, pre_v_c, v_t, v_c # previous value for treat, for control, current value for treat, for control
            float32_t denominator # 1 if save_SPMR is false, or depth in million if save_SPMR is true. Note, while piling up and calling peaks, treatment and control have been scaled to the same depth, so we need to find what this 'depth' is.
            FILE * ft
            FILE * fc
            basestring tmp_bytes

        [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl
        pos_array_ptr = <int32_t *> pos_array.data
        treat_array_ptr = <float32_t *> treat_array.data
        ctrl_array_ptr = <float32_t *> ctrl_array.data

        if self.save_SPMR:
            if self.treat_scaling_factor == 1:
                # in this case, control has been asked to be scaled to depth of treatment
                denominator = self.treat.total/1e6
            else:
                # in this case, treatment has been asked to be scaled to depth of control
                denominator = self.ctrl.total/1e6
        else:
            denominator = 1.0

        l = pos_array.shape[ 0 ]

        if l == 0:              # if there is no data, return
            return False

        ft = self.bedGraph_treat_f
        fc = self.bedGraph_ctrl_f
        #t_write_func = self.bedGraph_treat.write
        #c_write_func = self.bedGraph_ctrl.write

        pre_p_t = 0
        pre_p_c = 0
        pre_v_t = treat_array_ptr[ 0 ]/denominator
        pre_v_c = ctrl_array_ptr[ 0 ]/denominator
        treat_array_ptr += 1
        ctrl_array_ptr += 1

        for i in range( 1, l ):
            v_t = treat_array_ptr[ 0 ]/denominator
            v_c = ctrl_array_ptr[ 0 ]/denominator
            p = pos_array_ptr[ 0 ]
            pos_array_ptr += 1
            treat_array_ptr += 1
            ctrl_array_ptr += 1

            if abs(pre_v_t - v_t) > 1e-5: # precision is 5 digits
                fprintf( ft, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_t, p, pre_v_t )
                pre_v_t = v_t
                pre_p_t = p

            if abs(pre_v_c - v_c) > 1e-5: # precision is 5 digits
                fprintf( fc, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_c, p, pre_v_c )
                pre_v_c = v_c
                pre_p_c = p

        p = pos_array_ptr[ 0 ]
        # last one
        fprintf( ft, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_t, p, pre_v_t )
        fprintf( fc, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_c, p, pre_v_c )

        return True
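    # The emitted lines follow the 0-based, half-open bedGraph convention,
    # one line per run of (nearly) constant value, e.g.:
    #   chr1<TAB>0<TAB>1055<TAB>0.80000
    # Adjacent intervals are merged until the value changes by more than
    # 1e-5, which keeps the output compact for flat lambda regions.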
    cpdef call_broadpeaks (self, list scoring_function_symbols, list lvl1_cutoff_s, list lvl2_cutoff_s, int32_t min_length=200, int32_t lvl1_max_gap=50, int32_t lvl2_max_gap=400, bool cutoff_analysis = False):
        """This function tries to find enriched regions within which
        scores are continuously higher than a given cutoff for level
        1, and link them using the gap above the level 2 cutoff with a
        maximum length of lvl2_max_gap.

        scoring_function_symbols: symbols of functions to calculate score. 'p' for pscore, 'q' for qscore, 'f' for fold change, 's' for subtraction. for example: ['p', 'q']

        lvl1_cutoff_s: list of cutoffs at highly enriched regions, corresponding to scoring functions.
        lvl2_cutoff_s: list of cutoffs at less enriched regions, corresponding to scoring functions.
        min_length  : minimum peak length, default 200.
        lvl1_max_gap: maximum gap to merge nearby enriched peaks, default 50.
        lvl2_max_gap: maximum length of linkage regions, default 400.

        Return both general PeakIO object for highly enriched regions
        and gapped broad regions in BroadPeakIO.
        """
        cdef:
            int32_t i, j
            bytes chrom
            bytes tmp_bytes
            object lvl1peaks, lvl1peakschrom, lvl1
            object lvl2peaks, lvl2peakschrom, lvl2
            object broadpeaks
            list chrs, tmppeakset

        lvl1peaks = PeakIO()
        lvl2peaks = PeakIO()

        # prepare p-q table
        if len( self.pqtable ) == 0:
            info("#3 Pre-compute pvalue-qvalue table...")
            if cutoff_analysis:
                info("#3 Cutoff value vs broad region calls will be analyzed!")
                self.__pre_computes( max_gap = lvl2_max_gap, min_length = min_length )
            else:
                self.__cal_pvalue_qvalue_table()

        # prepare bedGraph file
        if self.save_bedGraph:
            self.bedGraph_treat_f = fopen( self.bedGraph_treat_filename, "w" )
            self.bedGraph_ctrl_f = fopen( self.bedGraph_control_filename, "w" )
            info ("#3 In the peak calling step, the following will be performed simultaneously:")
            info ("#3   Write bedGraph files for treatment pileup (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_treat_pileup.bdg")
            info ("#3   Write bedGraph files for control lambda (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_control_lambda.bdg")

            if self.trackline:
                # this line is REQUIRED by the wiggle format for UCSC browser
                tmp_bytes = ("track type=bedGraph name=\"treatment pileup\" description=\"treatment pileup after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode()
                fprintf( self.bedGraph_treat_f, tmp_bytes )
                tmp_bytes = ("track type=bedGraph name=\"control lambda\" description=\"control lambda after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode()
                fprintf( self.bedGraph_ctrl_f, tmp_bytes )

        info("#3 Call peaks for each chromosome...")
        for chrom in self.chromosomes:
            self.__chrom_call_broadpeak_using_certain_criteria ( lvl1peaks, lvl2peaks, chrom, scoring_function_symbols, lvl1_cutoff_s, lvl2_cutoff_s, min_length, lvl1_max_gap, lvl2_max_gap, self.save_bedGraph )

        # close bedGraph file
        if self.save_bedGraph:
            fclose( self.bedGraph_treat_f )
            fclose( self.bedGraph_ctrl_f )
            #self.bedGraph_ctrl.close()
            self.save_bedGraph = False

        # now combine lvl1 and lvl2 peaks
        chrs = lvl1peaks.get_chr_names()
        broadpeaks = BroadPeakIO()
        # use lvl2_peaks as linking regions between lvl1_peaks
        for chrom in sorted(chrs):
            lvl1peakschrom = lvl1peaks.get_data_from_chrom(chrom)
            lvl2peakschrom = lvl2peaks.get_data_from_chrom(chrom)
            lvl1peakschrom_next = iter(lvl1peakschrom).__next__
            tmppeakset = []     # to temporarily store lvl1 region inside a lvl2 region
            # our assumption is lvl1 regions should be included in lvl2 regions
            try:
                lvl1 = lvl1peakschrom_next()
                for i in range( len(lvl2peakschrom) ):
                    # for each lvl2 peak, find all lvl1 peaks inside
                    # I assume lvl1 peaks can be ALL covered by lvl2 peaks.
                    lvl2 = lvl2peakschrom[i]

                    while True:
                        if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]:
                            tmppeakset.append(lvl1)
                            lvl1 = lvl1peakschrom_next()
                        else:
                            # make a hierarchical broad peak
                            #print lvl2["start"], lvl2["end"], lvl2["score"]
                            self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset)
                            tmppeakset = []
                            break
            except StopIteration:
                # no more strong (aka lvl1) peaks left
                self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset)
                tmppeakset = []
                # add the rest lvl2 peaks
                for j in range( i+1, len(lvl2peakschrom) ):
                    self.__add_broadpeak( broadpeaks, chrom, lvl2peakschrom[j], tmppeakset )

        return broadpeaks
    cdef void __chrom_call_broadpeak_using_certain_criteria ( self, lvl1peaks, lvl2peaks, bytes chrom, list scoring_function_s, list lvl1_cutoff_s, list lvl2_cutoff_s,
                                                              int32_t min_length, int32_t lvl1_max_gap, int32_t lvl2_max_gap, bool save_bedGraph):
        """ Call peaks for a chromosome.

        Combination of criteria is allowed here.

        peaks: a PeakIO object
        scoring_function_s: symbols of functions to calculate score as score=f(x, y) where x is treatment pileup, and y is control pileup
        save_bedGraph     : whether or not to save pileup and control into a bedGraph file
        """
        cdef:
            int32_t i
            str s
            np.ndarray above_cutoff, above_cutoff_endpos, above_cutoff_startpos
            np.ndarray pos_array, treat_array, ctrl_array
            np.ndarray above_cutoff_index_array
            list score_array_s          # list to keep different types of scores
            list peak_content
            int64_t tl, lastp, ts, te, ti
            float32_t tp, cp
            int32_t * acs_ptr
            int32_t * ace_ptr
            int32_t * acia_ptr
            float32_t * treat_array_ptr
            float32_t * ctrl_array_ptr

        assert len(scoring_function_s) == len(lvl1_cutoff_s), "number of functions and cutoffs should be the same!"
        assert len(scoring_function_s) == len(lvl2_cutoff_s), "number of functions and cutoffs should be the same!"

        # first, build pileup, self.chr_pos_treat_ctrl
        self.__pileup_treat_ctrl_a_chromosome( chrom )
        [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl

        # while save_bedGraph is true, invoke __write_bedGraph_for_a_chromosome
        if save_bedGraph:
            self.__write_bedGraph_for_a_chromosome ( chrom )

        # keep all types of scores needed
        score_array_s = []
        for i in range(len(scoring_function_s)):
            s = scoring_function_s[i]
            if s == 'p':
                score_array_s.append( self.__cal_pscore( treat_array, ctrl_array ) )
            elif s == 'q':
                score_array_s.append( self.__cal_qscore( treat_array, ctrl_array ) )
            elif s == 'f':
                score_array_s.append( self.__cal_FE( treat_array, ctrl_array ) )
            elif s == 's':
                score_array_s.append( self.__cal_subtraction( treat_array, ctrl_array ) )

        # lvl1 : strong peaks
        peak_content = []           # to store points above cutoff

        # get the regions with scores above cutoffs
        above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,lvl1_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray?
        above_cutoff_index_array = np.arange(pos_array.shape[0],dtype="int32")[above_cutoff] # indices
        above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff
        above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff

        if above_cutoff.size == 0:
            # nothing above cutoff
            return

        if above_cutoff[0] == 0:
            # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
            above_cutoff_startpos[0] = 0

        # first bit of region above cutoff
        acs_ptr = <int32_t *>above_cutoff_startpos.data
        ace_ptr = <int32_t *>above_cutoff_endpos.data
        acia_ptr = <int32_t *>above_cutoff_index_array.data
        treat_array_ptr = <float32_t *> treat_array.data
        ctrl_array_ptr = <float32_t *> ctrl_array.data

        ts = acs_ptr[ 0 ]
        te = ace_ptr[ 0 ]
        ti = acia_ptr[ 0 ]
        tp = treat_array_ptr[ ti ]
        cp = ctrl_array_ptr[ ti ]
        peak_content.append( ( ts, te, tp, cp, ti ) )
        acs_ptr += 1 # move ptr
        ace_ptr += 1
        acia_ptr += 1
        lastp = te

        #peak_content.append( (above_cutoff_startpos[0], above_cutoff_endpos[0], treat_array[above_cutoff_index_array[0]], ctrl_array[above_cutoff_index_array[0]], score_array_s, above_cutoff_index_array[0] ) )
        for i in range( 1, above_cutoff_startpos.size ):
            ts = acs_ptr[ 0 ]
            te = ace_ptr[ 0 ]
            ti = acia_ptr[ 0 ]
            acs_ptr += 1
            ace_ptr += 1
            acia_ptr += 1
            tp = treat_array_ptr[ ti ]
            cp = ctrl_array_ptr[ ti ]
            tl = ts - lastp
            if tl <= lvl1_max_gap:
                # append
                #peak_content.append( (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]], score_array_s, above_cutoff_index_array[i] ) )
                peak_content.append( ( ts, te, tp, cp, ti ) )
                lastp = te
            else:
                # close
                self.__close_peak_for_broad_region (peak_content, lvl1peaks, min_length, chrom, lvl1_max_gap//2, score_array_s )
                #peak_content = [ (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]], score_array_s, above_cutoff_index_array[i]) , ]
                peak_content = [ ( ts, te, tp, cp, ti ), ]
                lastp = te #above_cutoff_endpos[i]

        # save the last peak
        if peak_content:
            self.__close_peak_for_broad_region (peak_content, lvl1peaks, min_length, chrom, lvl1_max_gap//2, score_array_s )

        # lvl2 : weak peaks
        peak_content = []           # to store points above cutoff

        # get the regions with scores above cutoffs
        above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,lvl2_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray?
        above_cutoff_index_array = np.arange(pos_array.shape[0],dtype="int32")[above_cutoff] # indices
        above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff
        above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff

        if above_cutoff.size == 0:
            # nothing above cutoff
            return

        if above_cutoff[0] == 0:
            # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
            above_cutoff_startpos[0] = 0

        # first bit of region above cutoff
        acs_ptr = <int32_t *>above_cutoff_startpos.data
        ace_ptr = <int32_t *>above_cutoff_endpos.data
        acia_ptr = <int32_t *>above_cutoff_index_array.data
        treat_array_ptr = <float32_t *> treat_array.data
        ctrl_array_ptr = <float32_t *> ctrl_array.data

        ts = acs_ptr[ 0 ]
        te = ace_ptr[ 0 ]
        ti = acia_ptr[ 0 ]
        tp = treat_array_ptr[ ti ]
        cp = ctrl_array_ptr[ ti ]
        peak_content.append( ( ts, te, tp, cp, ti ) )
        acs_ptr += 1 # move ptr
        ace_ptr += 1
        acia_ptr += 1
        lastp = te

        for i in range( 1, above_cutoff_startpos.size ):
            # for everything above cutoff
            ts = acs_ptr[ 0 ] # get the start
            te = ace_ptr[ 0 ] # get the end
            ti = acia_ptr[ 0 ]# get the index

            acs_ptr += 1 # move ptr
            ace_ptr += 1
            acia_ptr += 1
            tp = treat_array_ptr[ ti ] # get the treatment pileup
            cp = ctrl_array_ptr[ ti ]  # get the control pileup
            tl = ts - lastp # get the distance from the current point to last position of existing peak_content

            if tl <= lvl2_max_gap:
                # append
                peak_content.append( ( ts, te, tp, cp, ti ) )
                lastp = te
            else:
                # close
                self.__close_peak_for_broad_region (peak_content, lvl2peaks, min_length, chrom, lvl2_max_gap//2, score_array_s )
                peak_content = [ ( ts, te, tp, cp, ti ), ]
                lastp = te

        # save the last peak
        if peak_content:
            self.__close_peak_for_broad_region (peak_content, lvl2peaks, min_length, chrom, lvl2_max_gap//2, score_array_s )
        return
    cdef bool __close_peak_for_broad_region (self, list peak_content, peaks, int32_t min_length,
                                             bytes chrom, int32_t smoothlen, list score_array_s, list score_cutoff_s=[]):
        """Close the broad peak region, output peak boundaries, peak summit
        and scores, then add the peak to peakIO object.

        peak_content contains [start, end, treat_p, ctrl_p, list_scores]

        peaks: a BroadPeakIO object

        """
        cdef:
            int32_t summit_pos, tstart, tend, tmpindex, summit_index, i, midindex
            float64_t treat_v, ctrl_v, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score
            list tlist_pileup, tlist_control, tlist_length
            int32_t tlist_scores_p
            np.ndarray tarray_pileup, tarray_control, tarray_pscore, tarray_qscore, tarray_fc
            int64_t peak_length

        peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
        if peak_length >= min_length: # if the peak is too small, reject it
            tlist_pileup = []
            tlist_control = []
            tlist_length = []
            for i in range(len(peak_content)): # each position in broad peak
                (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i]
                tlist_pileup.append( ttreat_p )
                tlist_control.append( tctrl_p )
                tlist_length.append( tend - tstart )

            tarray_pileup = np.array( tlist_pileup, dtype="float32")
            tarray_control = np.array( tlist_control, dtype="float32")
            tarray_pscore = self.__cal_pscore( tarray_pileup, tarray_control )
            tarray_qscore = self.__cal_qscore( tarray_pileup, tarray_control )
            tarray_fc = self.__cal_FE( tarray_pileup, tarray_control )

            peaks.add( chrom,              # chromosome
                       peak_content[0][0], # start
                       peak_content[-1][1],# end
                       summit = 0,
                       peak_score  = mean_from_value_length( tarray_qscore, tlist_length ),
                       pileup      = mean_from_value_length( tarray_pileup, tlist_length ),
                       pscore      = mean_from_value_length( tarray_pscore, tlist_length ),
                       fold_change = mean_from_value_length( tarray_fc, tlist_length ),
                       qscore      = mean_from_value_length( tarray_qscore, tlist_length ),
                       )
            #if chrom == "chr1" and peak_content[0][0] == 237643 and peak_content[-1][1] == 237935:
            #    print tarray_qscore, tlist_length
            # start a new peak
            return True
    cdef __add_broadpeak (self, bpeaks, bytes chrom, object lvl2peak, list lvl1peakset):
        """Internal function to create broad peak.

        *Note* lvl1peakset/strong_regions might be empty
        """

        cdef:
            int32_t blockNum, start, end
            bytes blockSizes, blockStarts, thickStart, thickEnd

        start = lvl2peak["start"]
        end = lvl2peak["end"]

        if not lvl1peakset:
            # will complement by adding 1bps start and end to this region
            # may change in the future if gappedPeak format was improved.
            bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=(b"%d" % start), thickEnd=(b"%d" % end),
                       blockNum = 2, blockSizes = b"1,1", blockStarts = (b"0,%d" % (end-start-1)), pileup = lvl2peak["pileup"],
                       pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"],
                       qscore = lvl2peak["qscore"] )
            return bpeaks

        thickStart = b"%d" % (lvl1peakset[0]["start"])
        thickEnd = b"%d" % (lvl1peakset[-1]["end"])
        blockNum = len(lvl1peakset)
        blockSizes = b",".join([b"%d" % y for y in [x["length"] for x in lvl1peakset]])
        blockStarts = b",".join([b"%d" % x for x in getitem_then_subtract(lvl1peakset, start)])

        # add 1bp left and/or right block if necessary
        if int(thickStart) != start:
            # add 1bp left block
            thickStart = b"%d" % start
            blockNum += 1
            blockSizes = b"1,"+blockSizes
            blockStarts = b"0,"+blockStarts
        if int(thickEnd) != end:
            # add 1bp right block
            thickEnd = b"%d" % end
            blockNum += 1
            blockSizes = blockSizes + b",1"
            blockStarts = blockStarts + b"," + (b"%d" % (end-start-1))

        bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=thickStart, thickEnd=thickEnd,
                   blockNum = blockNum, blockSizes = blockSizes, blockStarts = blockStarts, pileup = lvl2peak["pileup"],
                   pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"],
                   qscore = lvl2peak["qscore"] )
        return bpeaks