1 # cython: language_level=3
3 # Time-stamp: <2023-08-02 14:19:28 Tao Liu>
5 """Module Description: For pileup functions.
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file LICENSE included with
12 # ------------------------------------
14 # ------------------------------------
15 from time
import time
as ttime
17 # ------------------------------------
19 # ------------------------------------
20 from MACS3
.Utilities
.Constants
import *
21 from MACS3
.Signal
.cPosValCalculation cimport single_end_pileup
as c_single_end_pileup
22 from MACS3
.Signal
.cPosValCalculation cimport write_pv_array_to_bedGraph
as c_write_pv_array_to_bedGraph
23 from MACS3
.Signal
.cPosValCalculation cimport PosVal
24 from MACS3
.Signal
.cPosValCalculation cimport quick_pileup
as c_quick_pileup
26 # ------------------------------------
28 # ------------------------------------
31 from numpy cimport int32_t
, float32_t
32 from cpython cimport bool
33 from cpython cimport PyObject
35 # ------------------------------------
37 # ------------------------------------
38 from libc
.stdlib cimport malloc
, free
, qsort
40 # ------------------------------------
41 # utility internal functions
42 # ------------------------------------
43 cdef inline
float mean
( float a
, float b
):
46 cdef void clean_up_ndarray
( np
.ndarray x
):
47 """ Clean up numpy array in two steps
52 x
.resize
( 100000 if i
> 100000 else i
, refcheck
=False
)
53 x
.resize
( 0, refcheck
=False
)
56 cdef np
.ndarray
[np
.int32_t
, ndim
=1] fix_coordinates
(np
.ndarray
[np
.int32_t
, ndim
=1] poss
, int rlength
):
57 """Fix the coordinates.
63 ptr
= <int32_t
*> poss
.data
65 # fix those negative coordinates
66 for i
in range( poss
.shape
[0] ):
72 # fix those over-boundary coordinates
73 for i
in range( poss
.shape
[0]-1, -1, -1 ):
80 # ------------------------------------
82 # ------------------------------------
84 # ------------------------------------
85 # functions for pileup_cmd only
86 # ------------------------------------
88 # This function uses pure C code for pileup
89 cpdef
void pileup_and_write_se
( trackI
,
90 bytes output_filename
,
93 float baseline_value
= 0.0,
94 bint directional
= True
,
95 bint halfextension
= True
):
96 """ Pileup a FWTrackI object and write the pileup result into a
99 This function calls pure C functions from cPosValCalculation.
101 This function is currently only used `macs3 pileup` cmd.
104 long five_shift
, three_shift
, l
, i
108 np
.ndarray
[np
.int32_t
, ndim
=1] plus_tags
, minus_tags
109 dict chrlengths
= trackI
.get_rlengths
()
119 # This block should be reused to determine the actual shift values
121 # only extend to 3' side
123 five_shift
= d
//-4 # five shift is used to move cursor towards 5' direction to find the start of fragment
124 three_shift
= d
*3//4 # three shift is used to move cursor towards 3' direction to find the end of fragment
132 three_shift
= five_shift
135 three_shift
= d
- five_shift
138 chroms
= list(chrlengths
.keys
())
139 n_chroms
= len( chroms
)
141 fh
= open(output_filename
, "w")
145 for i
in range( n_chroms
):
147 (plus_tags
, minus_tags
) = trackI
.get_locations_by_chr
(chrom
)
148 rlength
= <long> chrlengths
[ chrom
]
149 plus_tags_pos
= <int *> plus_tags
.data
150 minus_tags_pos
= <int *> minus_tags
.data
152 _data
= c_single_end_pileup
( plus_tags_pos
, plus_tags
.shape
[0], minus_tags_pos
, minus_tags
.shape
[0], five_shift
, three_shift
, 0, rlength
, scale_factor
, baseline_value
, &l_data
)
156 chrom_char
= py_bytes
157 c_write_pv_array_to_bedGraph
( _data
, l_data
, chrom_char
, output_filename
, 1 )
163 # function to pileup BAMPE/BEDPE stored in PETrackI object and write to a BEDGraph file
164 # this function uses c function
165 cpdef pileup_and_write_pe
( petrackI
,
166 bytes output_filename
,
167 float scale_factor
= 1,
168 float baseline_value
= 0.0):
169 """ Pileup a PETrackI object and write the pileup result into a
172 This function calls pure C functions from cPosValCalculation.
174 This function is currently only used `macs3 pileup` cmd.
177 dict chrlengths
= petrackI
.get_rlengths
()
184 np
.ndarray
[np
.int32_t
, ndim
=1] locs0
185 np
.ndarray
[np
.int32_t
, ndim
=1] locs1
195 chroms
= list(chrlengths
.keys
())
196 n_chroms
= len( chroms
)
198 fh
= open(output_filename
, "w")
202 for i
in range( n_chroms
):
204 locs
= petrackI
.get_locations_by_chr
(chrom
)
206 locs0
= np
.sort
(locs
['l'])
207 locs1
= np
.sort
(locs
['r'])
208 start_pos
= <int *> locs0
.data
209 end_pos
= <int *> locs1
.data
211 _data
= c_quick_pileup
( start_pos
, end_pos
, locs0
.shape
[0], scale_factor
, baseline_value
, &l_data
)
215 chrom_char
= py_bytes
216 c_write_pv_array_to_bedGraph
( _data
, l_data
, chrom_char
, output_filename
, 1 )
222 # ------------------------------------
223 # functions for other codes
224 # ------------------------------------
226 # general pileup function implemented in cython
227 cpdef
list se_all_in_one_pileup
( np
.ndarray
[np
.int32_t
, ndim
=1] plus_tags
, np
.ndarray
[np
.int32_t
, ndim
=1] minus_tags
, long five_shift
, long three_shift
, int rlength
, float scale_factor
, float baseline_value
):
228 """Return pileup given 5' end of fragment at plus or minus strand
229 separately, and given shift at both direction to recover a
230 fragment. This function is for single end sequencing library
231 only. Please directly use 'quick_pileup' function for Pair-end
234 It contains a super-fast and simple algorithm proposed by Jie
235 Wang. It will take sorted start positions and end positions, then
236 compute pileup values.
238 It will return a pileup result in similar structure as
239 bedGraph. There are two python arrays:
241 [end positions, values] or '[p,v] array' in other description for
242 functions within MACS3.
244 Two arrays have the same length and can be matched by index. End
245 position at index x (p[x]) record continuous value of v[x] from
251 int a
, b
, p
, pre_p
, pileup
253 long lp
= plus_tags
.shape
[0]
254 long lm
= minus_tags
.shape
[0]
256 np
.ndarray
[np
.int32_t
, ndim
=1] start_poss
, end_poss
, ret_p
257 np
.ndarray
[np
.float32_t
, ndim
=1] ret_v
258 # pointers are used for numpy arrays
259 int32_t
* start_poss_ptr
260 int32_t
* end_poss_ptr
261 int32_t
* ret_p_ptr
# pointer for position array
262 float32_t
* ret_v_ptr
# pointer for value array
265 start_poss
= np
.concatenate
( ( plus_tags
-five_shift
, minus_tags
-three_shift
) )
266 end_poss
= np
.concatenate
( ( plus_tags
+three_shift
, minus_tags
+five_shift
) )
272 # fix negative coordinations and those extends over end of chromosomes
273 start_poss
= fix_coordinates
(start_poss
, rlength
)
274 end_poss
= fix_coordinates
(end_poss
, rlength
)
276 lx
= start_poss
.shape
[0]
278 start_poss_ptr
= <int32_t
*> start_poss
.data
279 end_poss_ptr
= <int32_t
*> end_poss
.data
281 ret_p
= np
.zeros
( 2 * lx
, dtype
="int32" )
282 ret_v
= np
.zeros
( 2 * lx
, dtype
="float32" )
284 ret_p_ptr
= <int32_t
*> ret_p
.data
285 ret_v_ptr
= <float32_t
*> ret_v
.data
287 tmp
= [ret_p
, ret_v
] # for (endpos,value)
288 i_s
= 0 # index of start_poss
289 i_e
= 0 # index of end_poss
293 if start_poss
.shape
[0] == 0: return tmp
294 pre_p
= min(start_poss_ptr
[0],end_poss_ptr
[0])
297 # the first chunk of 0
299 ret_v_ptr
[0] = max(0,baseline_value
)
306 assert start_poss
.shape
[0] == end_poss
.shape
[0]
307 lx
= start_poss
.shape
[0]
309 while i_s
< lx
and i_e
< lx
:
310 if start_poss_ptr
[0] < end_poss_ptr
[0]:
311 p
= start_poss_ptr
[0]
314 ret_v_ptr
[0] = max(pileup
* scale_factor
, baseline_value
)
322 elif start_poss_ptr
[0] > end_poss_ptr
[0]:
326 ret_v_ptr
[0] = max(pileup
* scale_factor
, baseline_value
)
341 # add rest of end positions
342 for i
in range(i_e
, lx
):
346 ret_v_ptr
[0] = max(pileup
* scale_factor
, baseline_value
)
355 clean_up_ndarray
( start_poss
)
356 clean_up_ndarray
( end_poss
)
359 ret_p
.resize
( I
, refcheck
=False
)
360 ret_v
.resize
( I
, refcheck
=False
)
364 # quick pileup implemented in cython
365 cpdef
list quick_pileup
( np
.ndarray
[np
.int32_t
, ndim
=1] start_poss
, np
.ndarray
[np
.int32_t
, ndim
=1] end_poss
, float scale_factor
, float baseline_value
):
366 """Return pileup given plus strand and minus strand positions of fragments.
368 A super-fast and simple algorithm proposed by Jie Wang. It will
369 take sorted start positions and end positions, then compute pileup
372 It will return a pileup result in similar structure as
373 bedGraph. There are two python arrays:
375 [end positions, values] or [p,v]
377 Two arrays have the same length and can be matched by index. End
378 position at index x (p[x]) record continuous value of v[x] from
384 int a
, b
, p
, pre_p
, pileup
386 long ls
= start_poss
.shape
[0]
387 long le
= end_poss
.shape
[0]
389 np
.ndarray
[np
.int32_t
, ndim
=1] ret_p
390 np
.ndarray
[np
.float32_t
, ndim
=1] ret_v
391 # pointers are used for numpy arrays
392 int32_t
* start_poss_ptr
393 int32_t
* end_poss_ptr
394 int32_t
* ret_p_ptr
# pointer for position array
395 float32_t
* ret_v_ptr
# pointer for value array
398 start_poss_ptr
= <int32_t
*> start_poss
.data
399 end_poss_ptr
= <int32_t
*> end_poss
.data
401 ret_p
= np
.zeros
( l
, dtype
="int32" )
402 ret_v
= np
.zeros
( l
, dtype
="float32" )
404 ret_p_ptr
= <int32_t
*> ret_p
.data
405 ret_v_ptr
= <float32_t
*> ret_v
.data
407 tmp
= [ret_p
, ret_v
] # for (endpos,value)
409 i_s
= 0 # index of plus_tags
410 i_e
= 0 # index of minus_tags
414 if ls
== 0: return tmp
415 pre_p
= min(start_poss_ptr
[0], end_poss_ptr
[0])
418 # the first chunk of 0
420 ret_v_ptr
[0] = max(0,baseline_value
)
427 while i_s
< ls
and i_e
< le
:
428 if start_poss_ptr
[0] < end_poss_ptr
[0]:
429 p
= start_poss_ptr
[0]
432 ret_v_ptr
[0] = max(pileup
* scale_factor
, baseline_value
)
438 #if pileup > max_pileup:
439 # max_pileup = pileup
442 elif start_poss_ptr
[0] > end_poss_ptr
[0]:
446 ret_v_ptr
[0] = max(pileup
* scale_factor
, baseline_value
)
461 # add rest of end positions
462 for i
in range(i_e
, le
):
464 #for p in minus_tags[i_e:]:
467 ret_v_ptr
[0] = max(pileup
* scale_factor
, baseline_value
)
475 ret_p
.resize
( I
, refcheck
=False
)
476 ret_v
.resize
( I
, refcheck
=False
)
480 # quick pileup implemented in cython
481 cpdef
list naive_quick_pileup
( np
.ndarray
[np
.int32_t
, ndim
=1] sorted_poss
, int extension
):
482 """Simple pileup, every tag will be extended left and right with length `extension`.
484 Note: Assumption is that `poss` has to be pre-sorted! There is no
485 check on whether it's sorted.
489 int a
, b
, p
, pre_p
, pileup
490 long l
= sorted_poss
.shape
[0]
491 np
.ndarray
[np
.int32_t
, ndim
=1] start_poss
492 np
.ndarray
[np
.int32_t
, ndim
=1] end_poss
493 np
.ndarray
[np
.int32_t
, ndim
=1] ret_p
494 np
.ndarray
[np
.float32_t
, ndim
=1] ret_v
495 # pointers are used for numpy arrays
496 int32_t
* start_poss_ptr
497 int32_t
* end_poss_ptr
498 int32_t
* ret_p_ptr
# pointer for position array
499 float32_t
* ret_v_ptr
# pointer for value array
501 start_poss
= sorted_poss
- extension
502 start_poss
[ start_poss
< 0 ] = 0
503 end_poss
= sorted_poss
+ extension
505 start_poss_ptr
= <int32_t
*> start_poss
.data
506 end_poss_ptr
= <int32_t
*> end_poss
.data
509 ret_p
= np
.zeros
( 2*l
, dtype
="int32" )
510 ret_v
= np
.zeros
( 2*l
, dtype
="float32" )
512 ret_p_ptr
= <int32_t
*> ret_p
.data
513 ret_v_ptr
= <float32_t
*> ret_v
.data
515 i_s
= 0 # index of plus_tags
516 i_e
= 0 # index of minus_tags
520 if l
== 0: return ret
521 pre_p
= min(start_poss_ptr
[0], end_poss_ptr
[0])
524 # the first chunk of 0
533 while i_s
< l
and i_e
< l
:
534 if start_poss_ptr
[0] < end_poss_ptr
[0]:
535 p
= start_poss_ptr
[0]
538 ret_v_ptr
[0] = pileup
546 elif start_poss_ptr
[0] > end_poss_ptr
[0]:
550 ret_v_ptr
[0] = pileup
564 # add rest of end positions
566 for i
in range(i_e
, l
):
570 ret_v_ptr
[0] = pileup
578 ret_p
.resize
( I
, refcheck
=False
)
579 ret_v
.resize
( I
, refcheck
=False
)
581 return [ ret_p
, ret_v
]
583 # general function to compare two pv arrays in cython.
584 cpdef
list over_two_pv_array
( list pv_array1
, list pv_array2
, func
="max" ):
585 """Merge two position-value arrays. For intersection regions, take
586 the maximum value within region.
588 pv_array1 and pv_array2 are [p,v] type lists, same as the output
589 from quick_pileup function. 'p' and 'v' are numpy arrays of int32
592 available operations are 'max', 'min', and 'mean'
598 np
.ndarray
[np
.int32_t
, ndim
=1] a1_pos
, a2_pos
, ret_pos
599 np
.ndarray
[np
.float32_t
, ndim
=1] a1_v
, a2_v
, ret_v
602 int32_t
* ret_pos_ptr
605 float32_t
* ret_v_ptr
606 long l1
, l2
, l
, i1
, i2
, I
615 raise Exception("Invalid function")
617 [ a1_pos
, a1_v
] = pv_array1
618 [ a2_pos
, a2_v
] = pv_array2
619 ret_pos
= np
.zeros
( a1_pos
.shape
[0] + a2_pos
.shape
[0], dtype
="int32" )
620 ret_v
= np
.zeros
( a1_pos
.shape
[0] + a2_pos
.shape
[0], dtype
="float32" )
622 a1_pos_ptr
= <int32_t
*> a1_pos
.data
623 a1_v_ptr
= <float32_t
*> a1_v
.data
624 a2_pos_ptr
= <int32_t
*> a2_pos
.data
625 a2_v_ptr
= <float32_t
*> a2_v
.data
626 ret_pos_ptr
= <int32_t
*> ret_pos
.data
627 ret_v_ptr
= <float32_t
*> ret_v
.data
636 pre_p
= 0 # remember the previous position in the new bedGraphTrackI object ret
638 while i1
< l1
and i2
< l2
:
639 ret_v_ptr
[0] = f
( a1_v_ptr
[0], a2_v_ptr
[0] )
641 if a1_pos_ptr
[0] < a2_pos_ptr
[0]:
642 # clip a region from pre_p to p1, then set pre_p as p1.
643 ret_pos_ptr
[0] = a1_pos_ptr
[0]
646 pre_p
= a1_pos_ptr
[0]
647 # call for the next p1 and v1
651 elif a1_pos_ptr
[0] > a2_pos_ptr
[0]:
652 # clip a region from pre_p to p2, then set pre_p as p2.
653 ret_pos_ptr
[0] = a2_pos_ptr
[0]
656 pre_p
= a2_pos_ptr
[0]
657 # call for the next p1 and v1
662 # from pre_p to p1 or p2, then set pre_p as p1 or p2.
663 ret_pos_ptr
[0] = a1_pos_ptr
[0]
666 pre_p
= a1_pos_ptr
[0]
667 # call for the next p1, v1, p2, v2.
675 ret_pos
.resize
( I
, refcheck
=False
)
676 ret_v
.resize
( I
, refcheck
=False
)
677 return [ret_pos
, ret_v
]
679 cpdef naive_call_peaks
( list pv_array
, float min_v
, float max_v
= 1e30
, int max_gap
= 50, int min_length
= 200 ):
681 int peak_length
, pre_p
, p
, i
, summit
, tstart
, tend
682 long x
# index used for searching the first peak
683 double v
, summit_value
, tvalue
686 list peak_content
# (pre_p, p, v) for each region in the peak
687 list ret_peaks
= [] # returned peak summit and height
691 psn
= iter
(ps
).__next__
# assign the next function to a viable to speed up
692 vsn
= iter
(vs
).__next__
694 pre_p
= 0 # remember previous position
696 # find the first region above min_v
697 try: # try to read the first data range for this chrom
702 x
+= 1 # index for the next point
704 peak_content
= [ ( pre_p
, p
, v
), ]
706 break # found the first range above min_v
710 for i
in range( x
, len( ps
) ):
711 # continue scan the rest regions
714 if v
<= min_v
: # not be detected as 'peak'
717 # for points above min_v
718 # if the gap is allowed
719 # gap = pre_p - peak_content[-1][1] or the dist between pre_p and the last p
720 if pre_p
- peak_content
[ -1 ][ 1 ] <= max_gap
:
721 peak_content
.append
( ( pre_p
, p
, v
) )
723 # when the gap is not allowed, close this peak IF length is larger than min_length
724 if peak_content
[ -1 ][ 1 ] - peak_content
[ 0 ][ 0 ] >= min_length
:
725 __close_peak
( peak_content
, ret_peaks
, max_v
, min_length
)
726 # reset and start a new peak
727 peak_content
= [ ( pre_p
, p
, v
), ]
732 if peak_content
[ -1 ][ 1 ] - peak_content
[ 0 ][ 0 ] >= min_length
:
733 __close_peak
( peak_content
, ret_peaks
, max_v
, min_length
)
736 cdef void __close_peak
( peak_content
, peaks
, float max_v
, int min_length
):
737 """Internal function to find the summit and height
739 If the height is larger than max_v, skip
744 for (tstart
,tend
,tvalue
) in peak_content
:
745 if not summit_value
or summit_value
< tvalue
:
746 tsummit
= [int((tend
+tstart
)/2),]
747 summit_value
= tvalue
748 elif summit_value
== tvalue
:
749 tsummit
.append
( int((tend
+tstart
)/2) )
750 summit
= tsummit
[int((len(tsummit
)+1)/2)-1 ]
751 if summit_value
< max_v
:
752 peaks
.append
( (summit
, summit_value
) )