Merge pull request #589 from tjni/clean-up-build-dependencies
[MACS.git] / MACS3 / Signal / FixWidthTrack.pyx
blob077d6324e8e92b07236c4785e658ae113aefc99f
1 # cython: language_level=3
2 # cython: profile=True
3 # Time-stamp: <2022-09-15 17:17:37 Tao Liu>
5 """Module for FWTrack classes.
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file LICENSE included with
9 the distribution).
10 """
12 # ------------------------------------
13 # python modules
14 # ------------------------------------
15 import sys
16 import io
17 from copy import copy
18 from collections import Counter
20 # ------------------------------------
21 # MACS3 modules
22 # ------------------------------------
24 from MACS3.Utilities.Constants import *
25 from MACS3.Signal.SignalProcessing import *
26 from MACS3.IO.PeakIO import PeakIO
27 from MACS3.Signal.Pileup import se_all_in_one_pileup, over_two_pv_array
29 # ------------------------------------
30 # Other modules
31 # ------------------------------------
32 from cpython cimport bool
33 cimport cython
34 import numpy as np
35 cimport numpy as np
36 from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t
38 # ------------------------------------
39 # constants
40 # ------------------------------------
41 __version__ = "FixWidthTrack $Revision$"
42 __author__ = "Tao Liu <taoliu@jimmy.harvard.edu>"
43 __doc__ = "FWTrack class"
45 cdef INT_MAX = <int32_t>((<uint32_t>(-1))>>1)
47 # ------------------------------------
48 # Misc functions
49 # ------------------------------------
51 # ------------------------------------
52 # Classes
53 # ------------------------------------
55 cdef class FWTrack:
56 """Fixed Width Locations Track class along the whole genome
57 (commonly with the same annotation type), which are stored in a
58 dict.
60 Locations are stored and organized by sequence names (chr names) in a
61 dict. They can be sorted by calling self.sort() function.
62 """
63 cdef:
64 dict __locations
65 dict __pointer
66 dict __buf_size
67 bool __sorted
68 bool __destroyed
69 dict rlengths
70 public int64_t buffer_size
71 public int64_t total
72 public object annotation
73 public object dups
74 public int32_t fw
75 public int64_t length
77 def __init__ (self, int32_t fw=0, char * anno="", int64_t buffer_size = 100000 ):
78 """fw is the fixed-width for all locations.
80 """
81 self.fw = fw
82 self.__locations = {} # location pairs: two strands
83 self.__pointer = {} # location pairs
84 self.__buf_size = {} # location pairs
85 self.__sorted = False
86 self.total = 0 # total tags
87 self.annotation = anno # need to be figured out
88 self.rlengths = {} # lengths of reference sequences, e.g. each chromosome in a genome
89 self.buffer_size = buffer_size
90 self.length = 0
91 self.__destroyed = False
93 cpdef void destroy ( self ):
94 """Destroy this object and release mem.
95 """
96 cdef:
97 set chrs
98 bytes chromosome
100 chrs = self.get_chr_names()
101 for chromosome in sorted(chrs):
102 if chromosome in self.__locations:
103 self.__locations[chromosome][0].resize( self.buffer_size, refcheck=False )
104 self.__locations[chromosome][0].resize( 0, refcheck=False )
105 self.__locations[chromosome][1].resize( self.buffer_size, refcheck=False )
106 self.__locations[chromosome][1].resize( 0, refcheck=False )
107 self.__locations[chromosome] = [None, None]
108 self.__locations.pop(chromosome)
109 self.__destroyed = True
110 return
112 cpdef void add_loc ( self, bytes chromosome, int32_t fiveendpos, int32_t strand ):
113 """Add a location to the list according to the sequence name.
115 chromosome -- mostly the chromosome name
116 fiveendpos -- 5' end pos, left for plus strand, right for minus strand
117 strand -- 0: plus, 1: minus
119 cdef:
120 int32_t i
121 int32_t b
122 np.ndarray arr
124 if chromosome not in self.__locations:
125 self.__buf_size[chromosome] = [ self.buffer_size, self.buffer_size ]
126 self.__locations[chromosome] = [ np.zeros(self.buffer_size, dtype='int32'), np.zeros(self.buffer_size, dtype='int32') ] # [plus,minus strand]
127 self.__pointer[chromosome] = [ 0, 0 ]
128 self.__locations[chromosome][strand][0] = fiveendpos
129 self.__pointer[chromosome][strand] = 1
130 else:
131 i = self.__pointer[chromosome][strand]
132 b = self.__buf_size[chromosome][strand]
133 arr = self.__locations[chromosome][strand]
134 if b == i:
135 b += self.buffer_size
136 arr.resize( b, refcheck = False )
137 self.__buf_size[chromosome][strand] = b
138 arr[i]= fiveendpos
139 self.__pointer[chromosome][strand] += 1
140 return
142 cpdef void finalize ( self ):
143 """ Resize np arrays for 5' positions and sort them in place
145 Note: If this function is called, it's impossible to append more files to this FWTrack object. So remember to call it after all the files are read!
148 cdef:
149 int32_t i
150 bytes c
151 set chrnames
153 self.total = 0
155 chrnames = self.get_chr_names()
157 for c in chrnames:
158 self.__locations[c][0].resize( self.__pointer[c][0], refcheck=False )
159 self.__locations[c][0].sort()
160 self.__locations[c][1].resize( self.__pointer[c][1], refcheck=False )
161 self.__locations[c][1].sort()
162 self.total += self.__locations[c][0].size + self.__locations[c][1].size
164 self.__sorted = True
165 self.length = self.fw * self.total
166 return
168 cpdef bint set_rlengths ( self, dict rlengths ):
169 """Set reference chromosome lengths dictionary.
171 Only the chromosome existing in this fwtrack object will be updated.
173 If chromosome in this fwtrack is not covered by given
174 rlengths, and it has no associated length, it will be set as
175 maximum integer.
178 cdef:
179 set valid_chroms, missed_chroms, extra_chroms
180 bytes chrom
182 valid_chroms = set(self.__locations.keys()).intersection(rlengths.keys())
183 for chrom in sorted(valid_chroms):
184 self.rlengths[chrom] = rlengths[chrom]
185 missed_chroms = set(self.__locations.keys()).difference(rlengths.keys())
186 for chrom in sorted(missed_chroms):
187 self.rlengths[chrom] = INT_MAX
188 return True
190 cpdef dict get_rlengths ( self ):
191 """Get reference chromosome lengths dictionary.
193 If self.rlength is empty, create a new dict where the length of
194 chromosome will be set as the maximum integer.
196 if not self.rlengths:
197 self.rlengths = dict([(k, INT_MAX) for k in self.__locations.keys()])
198 return self.rlengths
200 cpdef get_locations_by_chr ( self, bytes chromosome ):
201 """Return a tuple of two lists of locations for certain chromosome.
204 if chromosome in self.__locations:
205 return self.__locations[chromosome]
206 else:
207 raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome))
209 cpdef set get_chr_names ( self ):
210 """Return all the chromosome names stored in this track object.
212 return set(sorted(self.__locations.keys()))
214 cpdef void sort ( self ):
215 """Naive sorting for locations.
218 cdef:
219 int32_t i
220 bytes c
221 set chrnames
223 chrnames = self.get_chr_names()
225 for c in chrnames:
226 self.__locations[c][0].sort()
227 self.__locations[c][1].sort()
229 self.__sorted = True
230 return
232 @cython.boundscheck(False) # do not check that np indices are valid
233 cpdef uint64_t filter_dup ( self, int32_t maxnum = -1):
234 """Filter the duplicated reads.
236 Run it right after you add all data into this object.
238 Note, this function will *throw out* duplicates
239 permenantly. If you want to keep them, use separate_dups
240 instead.
242 cdef:
243 int32_t p, m, n, current_loc
244 # index for old array, and index for new one
245 uint64_t i_old, i_new, size, new_size
246 bytes k
247 np.ndarray[int32_t, ndim=1] plus, new_plus, minus, new_minus
248 set chrnames
250 if maxnum < 0: return self.total # do nothing
252 if not self.__sorted:
253 self.sort()
255 self.total = 0
256 self.length = 0
258 chrnames = self.get_chr_names()
260 for k in chrnames:
261 # for each chromosome.
262 # This loop body is too big, I may need to split code later...
264 # + strand
265 i_new = 0
266 plus = self.__locations[k][0]
267 size = plus.shape[0]
268 if len(plus) <= 1:
269 new_plus = plus # do nothing
270 else:
271 new_plus = np.zeros( self.__pointer[k][0] + 1,dtype='int32' )
272 new_plus[ i_new ] = plus[ i_new ] # first item
273 i_new += 1
274 n = 1 # the number of tags in the current location
275 current_loc = plus[0]
276 for i_old in range( 1, size ):
277 p = plus[ i_old ]
278 if p == current_loc:
279 n += 1
280 else:
281 current_loc = p
282 n = 1
283 if n <= maxnum:
284 new_plus[ i_new ] = p
285 i_new += 1
286 new_plus.resize( i_new, refcheck=False )
287 self.total += i_new
288 self.__pointer[k][0] = i_new
289 # free memory?
290 # I know I should shrink it to 0 size directly,
291 # however, on Mac OSX, it seems directly assigning 0
292 # doesn't do a thing.
293 plus.resize( self.buffer_size, refcheck=False )
294 plus.resize( 0, refcheck=False )
295 # hope there would be no mem leak...
297 # - strand
298 i_new = 0
299 minus = self.__locations[k][1]
300 size = minus.shape[0]
301 if len(minus) <= 1:
302 new_minus = minus # do nothing
303 else:
304 new_minus = np.zeros( self.__pointer[k][1] + 1,dtype='int32' )
305 new_minus[ i_new ] = minus[ i_new ] # first item
306 i_new += 1
307 n = 1 # the number of tags in the current location
308 current_loc = minus[0]
309 for i_old in range( 1, size ):
310 p = minus[ i_old ]
311 if p == current_loc:
312 n += 1
313 else:
314 current_loc = p
315 n = 1
316 if n <= maxnum:
317 new_minus[ i_new ] = p
318 i_new += 1
319 new_minus.resize( i_new, refcheck=False )
320 self.total += i_new
321 self.__pointer[k][1] = i_new
322 # free memory ?
323 # I know I should shrink it to 0 size directly,
324 # however, on Mac OSX, it seems directly assigning 0
325 # doesn't do a thing.
326 minus.resize( self.buffer_size, refcheck=False )
327 minus.resize( 0, refcheck=False )
328 # hope there would be no mem leak...
330 self.__locations[k]=[new_plus,new_minus]
332 self.length = self.fw * self.total
333 return self.total
335 cpdef void sample_percent (self, float32_t percent, int32_t seed = -1 ):
336 """Sample the tags for a given percentage.
338 Warning: the current object is changed!
340 cdef:
341 int32_t num, i_chrom # num: number of reads allowed on a certain chromosome
342 bytes k
343 set chrnames
345 self.total = 0
346 self.length = 0
348 chrnames = self.get_chr_names()
350 if seed >= 0:
351 np.random.seed(seed)
353 for k in chrnames:
354 # for each chromosome.
355 # This loop body is too big, I may need to split code later...
357 num = <int32_t>round(self.__locations[k][0].shape[0] * percent, 5 )
358 np.random.shuffle( self.__locations[k][0] )
359 self.__locations[k][0].resize( num, refcheck=False )
360 self.__locations[k][0].sort()
361 self.__pointer[k][0] = self.__locations[k][0].shape[0]
363 num = <int32_t>round(self.__locations[k][1].shape[0] * percent, 5 )
364 np.random.shuffle( self.__locations[k][1] )
365 self.__locations[k][1].resize( num, refcheck=False )
366 self.__locations[k][1].sort()
367 self.__pointer[k][1] = self.__locations[k][1].shape[0]
369 self.total += self.__pointer[k][0] + self.__pointer[k][1]
371 self.length = self.fw * self.total
372 return
374 cpdef void sample_num (self, uint64_t samplesize, int32_t seed = -1):
375 """Sample the tags for a given percentage.
377 Warning: the current object is changed!
379 cdef:
380 float32_t percent
382 percent = <float32_t>(samplesize)/self.total
383 self.sample_percent ( percent, seed )
384 return
386 cpdef void print_to_bed (self, fhd=None):
387 """Output FWTrack to BED format files. If fhd is given,
388 write to a file, otherwise, output to standard output.
391 cdef:
392 int32_t i, i_chrom, p
393 bytes k
394 set chrnames
396 if not fhd:
397 fhd = sys.stdout
398 assert isinstance(fhd,io.IOBase)
399 assert self.fw > 0, "FWTrack object .fw should be set larger than 0!"
401 chrnames = self.get_chr_names()
403 for k in chrnames:
404 # for each chromosome.
405 # This loop body is too big, I may need to split code later...
407 plus = self.__locations[k][0]
409 for i in range(plus.shape[0]):
410 p = plus[i]
411 fhd.write("%s\t%d\t%d\t.\t.\t%s\n" % (k.decode(),p,p+self.fw,"+") )
413 minus = self.__locations[k][1]
415 for i in range(minus.shape[0]):
416 p = minus[i]
417 fhd.write("%s\t%d\t%d\t.\t.\t%s\n" % (k.decode(),p-self.fw,p,"-") )
418 return
420 cpdef tuple extract_region_tags ( self, bytes chromosome, int32_t startpos, int32_t endpos ):
421 cdef:
422 int32_t i, pos
423 np.ndarray[int32_t, ndim=1] rt_plus, rt_minus
424 list temp
425 set chrnames
427 if not self.__sorted: self.sort()
429 chrnames = self.get_chr_names()
430 assert chromosome in chrnames, "chromosome %s can't be found in the FWTrack object." % chromosome
432 (plus, minus) = self.__locations[chromosome]
434 temp = []
435 for i in range(plus.shape[0]):
436 pos = plus[i]
437 if pos < startpos:
438 continue
439 elif pos > endpos:
440 break
441 else:
442 temp.append(pos)
443 rt_plus = np.array(temp)
445 temp = []
446 for i in range(minus.shape[0]):
447 pos = minus[i]
448 if pos < startpos:
449 continue
450 elif pos > endpos:
451 break
452 else:
453 temp.append(pos)
454 rt_minus = np.array(temp)
455 return (rt_plus, rt_minus)
457 cpdef list compute_region_tags_from_peaks ( self, peaks, func, int32_t window_size = 100, float32_t cutoff = 5 ):
458 """Extract tags in peak, then apply func on extracted tags.
460 peaks: redefined regions to extract raw tags in PeakIO type: check cPeakIO.pyx.
462 func: a function to compute *something* from tags found in a predefined region
464 window_size: this will be passed to func.
466 cutoff: this will be passed to func.
468 func needs the fixed number of parameters, so it's not flexible. Here is an example:
470 wtd_find_summit(chrom, plus, minus, peak_start, peak_end, name , window_size, cutoff):
474 cdef:
475 int32_t m, i, j, pre_i, pre_j, pos, startpos, endpos
476 np.ndarray[int32_t, ndim=1] plus, minus, rt_plus, rt_minus
477 bytes chrom, name
478 list temp, retval
479 set pchrnames, chrnames
481 pchrnames = peaks.get_chr_names()
482 retval = []
484 # this object should be sorted
485 if not self.__sorted: self.sort()
486 # PeakIO object should be sorted
487 peaks.sort()
489 chrnames = self.get_chr_names()
491 for chrom in sorted(pchrnames):
492 assert chrom in chrnames, "chromosome %s can't be found in the FWTrack object." % chrom
493 (plus, minus) = self.__locations[chrom]
494 cpeaks = peaks.get_data_from_chrom(chrom)
495 prev_i = 0
496 prev_j = 0
497 for m in range(len(cpeaks)):
498 startpos = cpeaks[m]["start"] - window_size
499 endpos = cpeaks[m]["end"] + window_size
500 name = cpeaks[m]["name"]
502 temp = []
503 for i in range(prev_i,plus.shape[0]):
504 pos = plus[i]
505 if pos < startpos:
506 continue
507 elif pos > endpos:
508 prev_i = i
509 break
510 else:
511 temp.append(pos)
512 rt_plus = np.array(temp, dtype="int32")
514 temp = []
515 for j in range(prev_j,minus.shape[0]):
516 pos = minus[j]
517 if pos < startpos:
518 continue
519 elif pos > endpos:
520 prev_j = j
521 break
522 else:
523 temp.append(pos)
524 rt_minus = np.array(temp, dtype="int32")
526 retval.append( func(chrom, rt_plus, rt_minus, startpos, endpos, name = name, window_size = window_size, cutoff = cutoff) )
527 # rewind window_size
528 for i in range(prev_i, 0, -1):
529 if plus[prev_i] - plus[i] >= window_size:
530 break
531 prev_i = i
533 for j in range(prev_j, 0, -1):
534 if minus[prev_j] - minus[j] >= window_size:
535 break
536 prev_j = j
537 # end of a loop
539 return retval
541 cpdef list pileup_a_chromosome ( self, bytes chrom, list ds, list scale_factor_s, float32_t baseline_value = 0.0, bint directional = True, int32_t end_shift = 0 ):
542 """pileup a certain chromosome, return [p,v] (end position and value) list.
544 ds : tag will be extended to this value to 3' direction,
545 unless directional is False. Can contain multiple extension
546 values. Final pileup will the maximum.
547 scale_factor_s : linearly scale the pileup value applied to each d in ds. The list should have the same length as ds.
548 baseline_value : a value to be filled for missing values, and will be the minimum pileup.
549 directional : if False, the strand or direction of tag will be ignored, so that extension will be both sides with d/2.
550 end_shift : move cutting ends towards 5->3 direction if value is positive, or towards 3->5 direction if negative. Default is 0 -- no shift at all.
553 p and v are numpy.ndarray objects.
555 cdef:
556 int64_t d
557 int64_t five_shift, three_shift # adjustment to 5' end and 3' end positions to make a fragment
558 dict chrlengths = self.get_rlengths ()
559 int64_t rlength = chrlengths[chrom]
560 object ends
561 list five_shift_s = []
562 list three_shift_s = []
563 list tmp_pileup, prev_pileup
565 assert len(ds) == len(scale_factor_s), "ds and scale_factor_s must have the same length!"
567 # adjust extension length according to 'directional' and 'halfextension' setting.
568 for d in ds:
569 if directional:
570 # only extend to 3' side
571 five_shift_s.append( - end_shift )
572 three_shift_s.append( end_shift + d)
573 else:
574 # both sides
575 five_shift_s.append( d//2 - end_shift )
576 three_shift_s.append( end_shift + d - d//2)
578 prev_pileup = None
580 for i in range(len(ds)):
581 five_shift = five_shift_s[i]
582 three_shift = three_shift_s[i]
583 scale_factor = scale_factor_s[i]
584 tmp_pileup = se_all_in_one_pileup ( self.__locations[chrom][0], self.__locations[chrom][1], five_shift, three_shift, rlength, scale_factor, baseline_value )
586 if prev_pileup:
587 prev_pileup = over_two_pv_array ( prev_pileup, tmp_pileup, func="max" )
588 else:
589 prev_pileup = tmp_pileup
591 return prev_pileup
593 cdef inline int32_t left_sum ( data, int32_t pos, int32_t width ):
596 return sum([data[x] for x in data if x <= pos and x >= pos - width])
598 cdef inline int32_t right_sum ( data, int32_t pos, int32_t width ):
601 return sum([data[x] for x in data if x >= pos and x <= pos + width])
603 cdef inline int32_t left_forward ( data, int32_t pos, int32_t window_size ):
604 return data.get(pos,0) - data.get(pos-window_size, 0)
606 cdef inline int32_t right_forward ( data, int32_t pos, int32_t window_size ):
607 return data.get(pos + window_size, 0) - data.get(pos, 0)