1 # cython: language_level=3
3 # Time-stamp: <2022-09-15 17:17:37 Tao Liu>
5 """Module for FWTrack classes.
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file LICENSE included with
12 # ------------------------------------
14 # ------------------------------------
18 from collections
import Counter
20 # ------------------------------------
22 # ------------------------------------
24 from MACS3
.Utilities
.Constants
import *
25 from MACS3
.Signal
.SignalProcessing
import *
26 from MACS3
.IO
.PeakIO
import PeakIO
27 from MACS3
.Signal
.Pileup
import se_all_in_one_pileup
, over_two_pv_array
29 # ------------------------------------
31 # ------------------------------------
32 from cpython cimport bool
36 from numpy cimport uint8_t
, uint16_t
, uint32_t
, uint64_t
, int8_t
, int16_t
, int32_t
, int64_t
, float32_t
, float64_t
38 # ------------------------------------
40 # ------------------------------------
41 __version__
= "FixWidthTrack $Revision$"
42 __author__
= "Tao Liu <taoliu@jimmy.harvard.edu>"
43 __doc__
= "FWTrack class"
45 cdef INT_MAX
= <int32_t
>((<uint32_t
>(-1))>>1)
47 # ------------------------------------
49 # ------------------------------------
51 # ------------------------------------
53 # ------------------------------------
56 """Fixed Width Locations Track class along the whole genome
57 (commonly with the same annotation type), which are stored in a
60 Locations are stored and organized by sequence names (chr names) in a
61 dict. They can be sorted by calling self.sort() function.
70 public int64_t buffer_size
72 public object annotation
77 def __init__
(self, int32_t fw
=0, char * anno
="", int64_t buffer_size
= 100000 ):
78 """fw is the fixed-width for all locations.
82 self.__locations
= {} # location pairs: two strands
83 self.__pointer
= {} # location pairs
84 self.__buf_size
= {} # location pairs
86 self.total
= 0 # total tags
87 self.annotation
= anno
# need to be figured out
88 self.rlengths
= {} # lengths of reference sequences, e.g. each chromosome in a genome
89 self.buffer_size
= buffer_size
91 self.__destroyed
= False
93 cpdef
void destroy
( self ):
94 """Destroy this object and release mem.
100 chrs
= self.get_chr_names
()
101 for chromosome
in sorted
(chrs
):
102 if chromosome
in self.__locations
:
103 self.__locations
[chromosome
][0].resize
( self.buffer_size
, refcheck
=False
)
104 self.__locations
[chromosome
][0].resize
( 0, refcheck
=False
)
105 self.__locations
[chromosome
][1].resize
( self.buffer_size
, refcheck
=False
)
106 self.__locations
[chromosome
][1].resize
( 0, refcheck
=False
)
107 self.__locations
[chromosome
] = [None
, None
]
108 self.__locations
.pop
(chromosome
)
109 self.__destroyed
= True
112 cpdef
void add_loc
( self, bytes chromosome
, int32_t fiveendpos
, int32_t strand
):
113 """Add a location to the list according to the sequence name.
115 chromosome -- mostly the chromosome name
116 fiveendpos -- 5' end pos, left for plus strand, right for minus strand
117 strand -- 0: plus, 1: minus
124 if chromosome
not in self.__locations
:
125 self.__buf_size
[chromosome
] = [ self.buffer_size
, self.buffer_size
]
126 self.__locations
[chromosome
] = [ np
.zeros
(self.buffer_size
, dtype
='int32'), np
.zeros
(self.buffer_size
, dtype
='int32') ] # [plus,minus strand]
127 self.__pointer
[chromosome
] = [ 0, 0 ]
128 self.__locations
[chromosome
][strand
][0] = fiveendpos
129 self.__pointer
[chromosome
][strand
] = 1
131 i
= self.__pointer
[chromosome
][strand
]
132 b
= self.__buf_size
[chromosome
][strand
]
133 arr
= self.__locations
[chromosome
][strand
]
135 b
+= self.buffer_size
136 arr
.resize
( b
, refcheck
= False
)
137 self.__buf_size
[chromosome
][strand
] = b
139 self.__pointer
[chromosome
][strand
] += 1
142 cpdef
void finalize
( self ):
143 """ Resize np arrays for 5' positions and sort them in place
145 Note: If this function is called, it's impossible to append more files to this FWTrack object. So remember to call it after all the files are read!
155 chrnames
= self.get_chr_names
()
158 self.__locations
[c
][0].resize
( self.__pointer
[c
][0], refcheck
=False
)
159 self.__locations
[c
][0].sort
()
160 self.__locations
[c
][1].resize
( self.__pointer
[c
][1], refcheck
=False
)
161 self.__locations
[c
][1].sort
()
162 self.total
+= self.__locations
[c
][0].size
+ self.__locations
[c
][1].size
165 self.length
= self.fw
* self.total
168 cpdef bint set_rlengths
( self, dict rlengths
):
169 """Set reference chromosome lengths dictionary.
171 Only the chromosome existing in this fwtrack object will be updated.
173 If chromosome in this fwtrack is not covered by given
174 rlengths, and it has no associated length, it will be set as
179 set valid_chroms
, missed_chroms
, extra_chroms
182 valid_chroms
= set
(self.__locations
.keys
()).intersection
(rlengths
.keys
())
183 for chrom
in sorted
(valid_chroms
):
184 self.rlengths
[chrom
] = rlengths
[chrom
]
185 missed_chroms
= set
(self.__locations
.keys
()).difference
(rlengths
.keys
())
186 for chrom
in sorted
(missed_chroms
):
187 self.rlengths
[chrom
] = INT_MAX
190 cpdef dict get_rlengths
( self ):
191 """Get reference chromosome lengths dictionary.
193 If self.rlength is empty, create a new dict where the length of
194 chromosome will be set as the maximum integer.
196 if not self.rlengths
:
197 self.rlengths
= dict
([(k
, INT_MAX
) for k
in self.__locations
.keys
()])
200 cpdef get_locations_by_chr
( self, bytes chromosome
):
201 """Return a tuple of two lists of locations for certain chromosome.
204 if chromosome
in self.__locations
:
205 return self.__locations
[chromosome
]
207 raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome
))
209 cpdef set get_chr_names
( self ):
210 """Return all the chromosome names stored in this track object.
212 return set
(sorted
(self.__locations
.keys
()))
214 cpdef
void sort
( self ):
215 """Naive sorting for locations.
223 chrnames
= self.get_chr_names
()
226 self.__locations
[c
][0].sort
()
227 self.__locations
[c
][1].sort
()
232 @cython
.boundscheck
(False
) # do not check that np indices are valid
233 cpdef uint64_t filter_dup
( self, int32_t maxnum
= -1):
234 """Filter the duplicated reads.
236 Run it right after you add all data into this object.
238 Note, this function will *throw out* duplicates
239 permenantly. If you want to keep them, use separate_dups
243 int32_t p
, m
, n
, current_loc
244 # index for old array, and index for new one
245 uint64_t i_old
, i_new
, size
, new_size
247 np
.ndarray
[int32_t
, ndim
=1] plus
, new_plus
, minus
, new_minus
250 if maxnum
< 0: return self.total
# do nothing
252 if not self.__sorted
:
258 chrnames
= self.get_chr_names
()
261 # for each chromosome.
262 # This loop body is too big, I may need to split code later...
266 plus
= self.__locations
[k
][0]
269 new_plus
= plus
# do nothing
271 new_plus
= np
.zeros
( self.__pointer
[k
][0] + 1,dtype
='int32' )
272 new_plus
[ i_new
] = plus
[ i_new
] # first item
274 n
= 1 # the number of tags in the current location
275 current_loc
= plus
[0]
276 for i_old
in range( 1, size
):
284 new_plus
[ i_new
] = p
286 new_plus
.resize
( i_new
, refcheck
=False
)
288 self.__pointer
[k
][0] = i_new
290 # I know I should shrink it to 0 size directly,
291 # however, on Mac OSX, it seems directly assigning 0
292 # doesn't do a thing.
293 plus
.resize
( self.buffer_size
, refcheck
=False
)
294 plus
.resize
( 0, refcheck
=False
)
295 # hope there would be no mem leak...
299 minus
= self.__locations
[k
][1]
300 size
= minus
.shape
[0]
302 new_minus
= minus
# do nothing
304 new_minus
= np
.zeros
( self.__pointer
[k
][1] + 1,dtype
='int32' )
305 new_minus
[ i_new
] = minus
[ i_new
] # first item
307 n
= 1 # the number of tags in the current location
308 current_loc
= minus
[0]
309 for i_old
in range( 1, size
):
317 new_minus
[ i_new
] = p
319 new_minus
.resize
( i_new
, refcheck
=False
)
321 self.__pointer
[k
][1] = i_new
323 # I know I should shrink it to 0 size directly,
324 # however, on Mac OSX, it seems directly assigning 0
325 # doesn't do a thing.
326 minus
.resize
( self.buffer_size
, refcheck
=False
)
327 minus
.resize
( 0, refcheck
=False
)
328 # hope there would be no mem leak...
330 self.__locations
[k
]=[new_plus
,new_minus
]
332 self.length
= self.fw
* self.total
335 cpdef
void sample_percent
(self, float32_t percent
, int32_t seed
= -1 ):
336 """Sample the tags for a given percentage.
338 Warning: the current object is changed!
341 int32_t num
, i_chrom
# num: number of reads allowed on a certain chromosome
348 chrnames
= self.get_chr_names
()
354 # for each chromosome.
355 # This loop body is too big, I may need to split code later...
357 num
= <int32_t
>round(self.__locations
[k
][0].shape
[0] * percent
, 5 )
358 np
.random
.shuffle
( self.__locations
[k
][0] )
359 self.__locations
[k
][0].resize
( num
, refcheck
=False
)
360 self.__locations
[k
][0].sort
()
361 self.__pointer
[k
][0] = self.__locations
[k
][0].shape
[0]
363 num
= <int32_t
>round(self.__locations
[k
][1].shape
[0] * percent
, 5 )
364 np
.random
.shuffle
( self.__locations
[k
][1] )
365 self.__locations
[k
][1].resize
( num
, refcheck
=False
)
366 self.__locations
[k
][1].sort
()
367 self.__pointer
[k
][1] = self.__locations
[k
][1].shape
[0]
369 self.total
+= self.__pointer
[k
][0] + self.__pointer
[k
][1]
371 self.length
= self.fw
* self.total
374 cpdef
void sample_num
(self, uint64_t samplesize
, int32_t seed
= -1):
375 """Sample the tags for a given percentage.
377 Warning: the current object is changed!
382 percent
= <float32_t
>(samplesize
)/self.total
383 self.sample_percent
( percent
, seed
)
386 cpdef
void print_to_bed
(self, fhd
=None
):
387 """Output FWTrack to BED format files. If fhd is given,
388 write to a file, otherwise, output to standard output.
392 int32_t i
, i_chrom
, p
398 assert isinstance(fhd
,io
.IOBase
)
399 assert self.fw
> 0, "FWTrack object .fw should be set larger than 0!"
401 chrnames
= self.get_chr_names
()
404 # for each chromosome.
405 # This loop body is too big, I may need to split code later...
407 plus
= self.__locations
[k
][0]
409 for i
in range(plus
.shape
[0]):
411 fhd
.write
("%s\t%d\t%d\t.\t.\t%s\n" % (k
.decode
(),p
,p
+self.fw
,"+") )
413 minus
= self.__locations
[k
][1]
415 for i
in range(minus
.shape
[0]):
417 fhd
.write
("%s\t%d\t%d\t.\t.\t%s\n" % (k
.decode
(),p
-self.fw
,p
,"-") )
420 cpdef
tuple extract_region_tags
( self, bytes chromosome
, int32_t startpos
, int32_t endpos
):
423 np
.ndarray
[int32_t
, ndim
=1] rt_plus
, rt_minus
427 if not self.__sorted
: self.sort
()
429 chrnames
= self.get_chr_names
()
430 assert chromosome
in chrnames
, "chromosome %s can't be found in the FWTrack object." % chromosome
432 (plus
, minus
) = self.__locations
[chromosome
]
435 for i
in range(plus
.shape
[0]):
443 rt_plus
= np
.array
(temp
)
446 for i
in range(minus
.shape
[0]):
454 rt_minus
= np
.array
(temp
)
455 return (rt_plus
, rt_minus
)
457 cpdef
list compute_region_tags_from_peaks
( self, peaks
, func
, int32_t window_size
= 100, float32_t cutoff
= 5 ):
458 """Extract tags in peak, then apply func on extracted tags.
460 peaks: redefined regions to extract raw tags in PeakIO type: check cPeakIO.pyx.
462 func: a function to compute *something* from tags found in a predefined region
464 window_size: this will be passed to func.
466 cutoff: this will be passed to func.
468 func needs the fixed number of parameters, so it's not flexible. Here is an example:
470 wtd_find_summit(chrom, plus, minus, peak_start, peak_end, name , window_size, cutoff):
475 int32_t m
, i
, j
, pre_i
, pre_j
, pos
, startpos
, endpos
476 np
.ndarray
[int32_t
, ndim
=1] plus
, minus
, rt_plus
, rt_minus
479 set pchrnames
, chrnames
481 pchrnames
= peaks
.get_chr_names
()
484 # this object should be sorted
485 if not self.__sorted
: self.sort
()
486 # PeakIO object should be sorted
489 chrnames
= self.get_chr_names
()
491 for chrom
in sorted
(pchrnames
):
492 assert chrom
in chrnames
, "chromosome %s can't be found in the FWTrack object." % chrom
493 (plus
, minus
) = self.__locations
[chrom
]
494 cpeaks
= peaks
.get_data_from_chrom
(chrom
)
497 for m
in range(len(cpeaks
)):
498 startpos
= cpeaks
[m
]["start"] - window_size
499 endpos
= cpeaks
[m
]["end"] + window_size
500 name
= cpeaks
[m
]["name"]
503 for i
in range(prev_i
,plus
.shape
[0]):
512 rt_plus
= np
.array
(temp
, dtype
="int32")
515 for j
in range(prev_j
,minus
.shape
[0]):
524 rt_minus
= np
.array
(temp
, dtype
="int32")
526 retval
.append
( func
(chrom
, rt_plus
, rt_minus
, startpos
, endpos
, name
= name
, window_size
= window_size
, cutoff
= cutoff
) )
528 for i
in range(prev_i
, 0, -1):
529 if plus
[prev_i
] - plus
[i
] >= window_size
:
533 for j
in range(prev_j
, 0, -1):
534 if minus
[prev_j
] - minus
[j
] >= window_size
:
541 cpdef
list pileup_a_chromosome
( self, bytes chrom
, list ds
, list scale_factor_s
, float32_t baseline_value
= 0.0, bint directional
= True
, int32_t end_shift
= 0 ):
542 """pileup a certain chromosome, return [p,v] (end position and value) list.
544 ds : tag will be extended to this value to 3' direction,
545 unless directional is False. Can contain multiple extension
546 values. Final pileup will the maximum.
547 scale_factor_s : linearly scale the pileup value applied to each d in ds. The list should have the same length as ds.
548 baseline_value : a value to be filled for missing values, and will be the minimum pileup.
549 directional : if False, the strand or direction of tag will be ignored, so that extension will be both sides with d/2.
550 end_shift : move cutting ends towards 5->3 direction if value is positive, or towards 3->5 direction if negative. Default is 0 -- no shift at all.
553 p and v are numpy.ndarray objects.
557 int64_t five_shift
, three_shift
# adjustment to 5' end and 3' end positions to make a fragment
558 dict chrlengths
= self.get_rlengths
()
559 int64_t rlength
= chrlengths
[chrom
]
561 list five_shift_s
= []
562 list three_shift_s
= []
563 list tmp_pileup
, prev_pileup
565 assert len(ds
) == len(scale_factor_s
), "ds and scale_factor_s must have the same length!"
567 # adjust extension length according to 'directional' and 'halfextension' setting.
570 # only extend to 3' side
571 five_shift_s
.append
( - end_shift
)
572 three_shift_s
.append
( end_shift
+ d
)
575 five_shift_s
.append
( d
//2 - end_shift
)
576 three_shift_s
.append
( end_shift
+ d
- d
//2)
580 for i
in range(len(ds
)):
581 five_shift
= five_shift_s
[i
]
582 three_shift
= three_shift_s
[i
]
583 scale_factor
= scale_factor_s
[i
]
584 tmp_pileup
= se_all_in_one_pileup
( self.__locations
[chrom
][0], self.__locations
[chrom
][1], five_shift
, three_shift
, rlength
, scale_factor
, baseline_value
)
587 prev_pileup
= over_two_pv_array
( prev_pileup
, tmp_pileup
, func
="max" )
589 prev_pileup
= tmp_pileup
593 cdef inline int32_t left_sum
( data
, int32_t pos
, int32_t width
):
596 return sum
([data
[x
] for x
in data
if x
<= pos
and x
>= pos
- width
])
598 cdef inline int32_t right_sum
( data
, int32_t pos
, int32_t width
):
601 return sum
([data
[x
] for x
in data
if x
>= pos
and x
<= pos
+ width
])
603 cdef inline int32_t left_forward
( data
, int32_t pos
, int32_t window_size
):
604 return data
.get
(pos
,0) - data
.get
(pos
-window_size
, 0)
606 cdef inline int32_t right_forward
( data
, int32_t pos
, int32_t window_size
):
607 return data
.get
(pos
+ window_size
, 0) - data
.get
(pos
, 0)