MACS3/Signal/FixWidthTrack.pyx

   1 # cython: language_level=3
   2 # cython: profile=True
   3 # Time-stamp: <2022-09-15 17:17:37 Tao Liu>
   4
   5 """Module for FWTrack classes.
   6
   7 This code is free software; you can redistribute it and/or modify it
   8 under the terms of the BSD License (see the file LICENSE included with
   9 the distribution).
  10 """
  11
  12 # ------------------------------------
  13 # python modules
  14 # ------------------------------------
  15 import sys
  16 import io
  17 from copy import copy
  18 from collections import Counter
  19
  20 # ------------------------------------
  21 # MACS3 modules
  22 # ------------------------------------
  23
  24 from MACS3.Utilities.Constants import *
  25 from MACS3.Signal.SignalProcessing import *
  26 from MACS3.IO.PeakIO import PeakIO
  27 from MACS3.Signal.Pileup import se_all_in_one_pileup, over_two_pv_array
  28
  29 # ------------------------------------
  30 # Other modules
  31 # ------------------------------------
  32 from cpython cimport bool
  33 cimport cython
  34 import numpy as np
  35 cimport numpy as np
  36 from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t
  37
  38 # ------------------------------------
  39 # constants
  40 # ------------------------------------
  41 __version__ = "FixWidthTrack $Revision$"
  42 __author__ = "Tao Liu <taoliu@jimmy.harvard.edu>"
  43 __doc__ = "FWTrack class"
  44
  45 cdef INT_MAX = <int32_t>((<uint32_t>(-1))>>1)
  46
  47 # ------------------------------------
  48 # Misc functions
  49 # ------------------------------------
  50
  51 # ------------------------------------
  52 # Classes
  53 # ------------------------------------
  54
  55 cdef class FWTrack:
  56     """Fixed Width Locations Track class  along the whole genome
  57     (commonly with the same annotation type), which are stored in a
  58     dict.
  59
  60     Locations are stored and organized by sequence names (chr names) in a
  61     dict. They can be sorted by calling self.sort() function.
  62     """
  63     cdef:
  64         dict __locations
  65         dict __pointer
  66         dict __buf_size
  67         bool __sorted
  68         bool __destroyed
  69         dict rlengths
  70         public int64_t buffer_size
  71         public int64_t total
  72         public object annotation
  73         public object dups
  74         public int32_t fw
  75         public int64_t length
  76
  77     def __init__ (self, int32_t fw=0, char * anno="", int64_t buffer_size = 100000 ):
  78         """fw is the fixed-width for all locations.
  79
  80         """
  81         self.fw = fw
  82         self.__locations = {}    # location pairs: two strands
  83         self.__pointer = {}      # location pairs
  84         self.__buf_size = {}     # location pairs
  85         self.__sorted = False
  86         self.total = 0           # total tags
  87         self.annotation = anno   # need to be figured out
  88         self.rlengths = {}       # lengths of reference sequences, e.g. each chromosome in a genome
  89         self.buffer_size = buffer_size
  90         self.length = 0
  91         self.__destroyed = False
  92
  93     cpdef void destroy ( self ):
  94         """Destroy this object and release mem.
  95         """
  96         cdef:
  97             set chrs
  98             bytes chromosome
  99
 100         chrs = self.get_chr_names()
 101         for chromosome in sorted(chrs):
 102             if chromosome in self.__locations:
 103                 self.__locations[chromosome][0].resize( self.buffer_size, refcheck=False )
 104                 self.__locations[chromosome][0].resize( 0, refcheck=False )
 105                 self.__locations[chromosome][1].resize( self.buffer_size, refcheck=False )
 106                 self.__locations[chromosome][1].resize( 0, refcheck=False )
 107                 self.__locations[chromosome] = [None, None]
 108                 self.__locations.pop(chromosome)
 109         self.__destroyed = True
 110         return
 111
 112     cpdef void add_loc ( self, bytes chromosome, int32_t fiveendpos, int32_t strand ):
 113         """Add a location to the list according to the sequence name.
 114
 115         chromosome -- mostly the chromosome name
 116         fiveendpos -- 5' end pos, left for plus strand, right for minus strand
 117         strand     -- 0: plus, 1: minus
 118         """
 119         cdef:
 120             int32_t i
 121             int32_t b
 122             np.ndarray arr
 123
 124         if chromosome not in self.__locations:
 125             self.__buf_size[chromosome] = [ self.buffer_size, self.buffer_size ]
 126             self.__locations[chromosome] = [ np.zeros(self.buffer_size, dtype='int32'), np.zeros(self.buffer_size, dtype='int32') ] # [plus,minus strand]
 127             self.__pointer[chromosome] = [ 0, 0 ]
 128             self.__locations[chromosome][strand][0] = fiveendpos
 129             self.__pointer[chromosome][strand] = 1
 130         else:
 131             i = self.__pointer[chromosome][strand]
 132             b = self.__buf_size[chromosome][strand]
 133             arr = self.__locations[chromosome][strand]
 134             if b == i:
 135                 b += self.buffer_size
 136                 arr.resize( b, refcheck = False )
 137                 self.__buf_size[chromosome][strand] = b
 138             arr[i]= fiveendpos
 139             self.__pointer[chromosome][strand] += 1
 140         return
 141
 142     cpdef void finalize ( self ):
 143         """ Resize np arrays for 5' positions and sort them in place
 144
 145         Note: If this function is called, it's impossible to append more files to this FWTrack object. So remember to call it after all the files are read!
 146         """
 147
 148         cdef:
 149             int32_t i
 150             bytes c
 151             set chrnames
 152
 153         self.total = 0
 154
 155         chrnames = self.get_chr_names()
 156
 157         for c in chrnames:
 158             self.__locations[c][0].resize( self.__pointer[c][0], refcheck=False )
 159             self.__locations[c][0].sort()
 160             self.__locations[c][1].resize( self.__pointer[c][1], refcheck=False )
 161             self.__locations[c][1].sort()
 162             self.total += self.__locations[c][0].size + self.__locations[c][1].size
 163
 164         self.__sorted = True
 165         self.length = self.fw * self.total
 166         return
 167
 168     cpdef bint set_rlengths ( self, dict rlengths ):
 169         """Set reference chromosome lengths dictionary.
 170
 171         Only the chromosome existing in this fwtrack object will be updated.
 172
 173         If chromosome in this fwtrack is not covered by given
 174         rlengths, and it has no associated length, it will be set as
 175         maximum integer.
 176
 177         """
 178         cdef:
 179             set valid_chroms, missed_chroms, extra_chroms
 180             bytes chrom
 181
 182         valid_chroms = set(self.__locations.keys()).intersection(rlengths.keys())
 183         for chrom in sorted(valid_chroms):
 184             self.rlengths[chrom] = rlengths[chrom]
 185         missed_chroms = set(self.__locations.keys()).difference(rlengths.keys())
 186         for chrom in sorted(missed_chroms):
 187             self.rlengths[chrom] = INT_MAX
 188         return True
 189
 190     cpdef dict get_rlengths ( self ):
 191         """Get reference chromosome lengths dictionary.
 192
 193         If self.rlength is empty, create a new dict where the length of
 194         chromosome will be set as the maximum integer.
 195         """
 196         if not self.rlengths:
 197             self.rlengths = dict([(k, INT_MAX) for k in self.__locations.keys()])
 198         return self.rlengths
 199
 200     cpdef get_locations_by_chr ( self, bytes chromosome ):
 201         """Return a tuple of two lists of locations for certain chromosome.
 202
 203         """
 204         if chromosome in self.__locations:
 205             return self.__locations[chromosome]
 206         else:
 207             raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome))
 208
 209     cpdef set get_chr_names ( self ):
 210         """Return all the chromosome names stored in this track object.
 211         """
 212         return set(sorted(self.__locations.keys()))
 213
 214     cpdef void sort ( self ):
 215         """Naive sorting for locations.
 216
 217         """
 218         cdef:
 219             int32_t i
 220             bytes c
 221             set chrnames
 222
 223         chrnames = self.get_chr_names()
 224
 225         for c in chrnames:
 226             self.__locations[c][0].sort()
 227             self.__locations[c][1].sort()
 228
 229         self.__sorted = True
 230         return
 231
 232     @cython.boundscheck(False) # do not check that np indices are valid
 233     cpdef uint64_t filter_dup ( self, int32_t maxnum = -1):
 234         """Filter the duplicated reads.
 235
 236         Run it right after you add all data into this object.
 237
 238         Note, this function will *throw out* duplicates
 239         permenantly. If you want to keep them, use separate_dups
 240         instead.
 241         """
 242         cdef:
 243             int32_t p, m, n, current_loc
 244             # index for old array, and index for new one
 245             uint64_t i_old, i_new, size, new_size
 246             bytes k
 247             np.ndarray[int32_t, ndim=1] plus, new_plus, minus, new_minus
 248             set chrnames
 249
 250         if maxnum < 0: return self.total         # do nothing
 251
 252         if not self.__sorted:
 253             self.sort()
 254
 255         self.total = 0
 256         self.length = 0
 257
 258         chrnames = self.get_chr_names()
 259
 260         for k in chrnames:
 261             # for each chromosome.
 262             # This loop body is too big, I may need to split code later...
 263
 264             # + strand
 265             i_new = 0
 266             plus = self.__locations[k][0]
 267             size = plus.shape[0]
 268             if len(plus) <= 1:
 269                 new_plus = plus         # do nothing
 270             else:
 271                 new_plus = np.zeros( self.__pointer[k][0] + 1,dtype='int32' )
 272                 new_plus[ i_new ] = plus[ i_new ] # first item
 273                 i_new += 1
 274                 n = 1                # the number of tags in the current location
 275                 current_loc = plus[0]
 276                 for i_old in range( 1, size ):
 277                     p = plus[ i_old ]
 278                     if p == current_loc:
 279                         n += 1
 280                     else:
 281                         current_loc = p
 282                         n = 1
 283                     if n <= maxnum:
 284                         new_plus[ i_new ] = p
 285                         i_new += 1
 286                 new_plus.resize( i_new, refcheck=False )
 287                 self.total +=  i_new
 288                 self.__pointer[k][0] = i_new
 289                 # free memory?
 290                 # I know I should shrink it to 0 size directly,
 291                 # however, on Mac OSX, it seems directly assigning 0
 292                 # doesn't do a thing.
 293                 plus.resize( self.buffer_size, refcheck=False )
 294                 plus.resize( 0, refcheck=False )
 295                 # hope there would be no mem leak...
 296
 297             # - strand
 298             i_new = 0
 299             minus = self.__locations[k][1]
 300             size = minus.shape[0]
 301             if len(minus) <= 1:
 302                 new_minus = minus         # do nothing
 303             else:
 304                 new_minus = np.zeros( self.__pointer[k][1] + 1,dtype='int32' )
 305                 new_minus[ i_new ] = minus[ i_new ] # first item
 306                 i_new += 1
 307                 n = 1                # the number of tags in the current location
 308                 current_loc = minus[0]
 309                 for i_old in range( 1, size ):
 310                     p = minus[ i_old ]
 311                     if p == current_loc:
 312                         n += 1
 313                     else:
 314                         current_loc = p
 315                         n = 1
 316                     if n <= maxnum:
 317                         new_minus[ i_new ] = p
 318                         i_new += 1
 319                 new_minus.resize( i_new, refcheck=False )
 320                 self.total +=  i_new
 321                 self.__pointer[k][1] = i_new
 322                 # free memory ?
 323                 # I know I should shrink it to 0 size directly,
 324                 # however, on Mac OSX, it seems directly assigning 0
 325                 # doesn't do a thing.
 326                 minus.resize( self.buffer_size, refcheck=False )
 327                 minus.resize( 0, refcheck=False )
 328                 # hope there would be no mem leak...
 329
 330             self.__locations[k]=[new_plus,new_minus]
 331
 332         self.length = self.fw * self.total
 333         return self.total
 334
 335     cpdef void sample_percent (self, float32_t percent, int32_t seed = -1 ):
 336         """Sample the tags for a given percentage.
 337
 338         Warning: the current object is changed!
 339         """
 340         cdef:
 341             int32_t num, i_chrom      # num: number of reads allowed on a certain chromosome
 342             bytes k
 343             set chrnames
 344
 345         self.total = 0
 346         self.length = 0
 347
 348         chrnames = self.get_chr_names()
 349
 350         if seed >= 0:
 351             np.random.seed(seed)
 352
 353         for k in chrnames:
 354             # for each chromosome.
 355             # This loop body is too big, I may need to split code later...
 356
 357             num = <int32_t>round(self.__locations[k][0].shape[0] * percent, 5 )
 358             np.random.shuffle( self.__locations[k][0] )
 359             self.__locations[k][0].resize( num, refcheck=False )
 360             self.__locations[k][0].sort()
 361             self.__pointer[k][0] = self.__locations[k][0].shape[0]
 362
 363             num = <int32_t>round(self.__locations[k][1].shape[0] * percent, 5 )
 364             np.random.shuffle( self.__locations[k][1] )
 365             self.__locations[k][1].resize( num, refcheck=False )
 366             self.__locations[k][1].sort()
 367             self.__pointer[k][1] = self.__locations[k][1].shape[0]
 368
 369             self.total += self.__pointer[k][0] + self.__pointer[k][1]
 370
 371         self.length = self.fw * self.total
 372         return
 373
 374     cpdef void sample_num (self, uint64_t samplesize, int32_t seed = -1):
 375         """Sample the tags for a given percentage.
 376
 377         Warning: the current object is changed!
 378         """
 379         cdef:
 380             float32_t percent
 381
 382         percent = <float32_t>(samplesize)/self.total
 383         self.sample_percent ( percent, seed )
 384         return
 385
 386     cpdef void print_to_bed (self, fhd=None):
 387         """Output FWTrack to BED format files. If fhd is given,
 388         write to a file, otherwise, output to standard output.
 389
 390         """
 391         cdef:
 392             int32_t i, i_chrom, p
 393             bytes k
 394             set chrnames
 395
 396         if not fhd:
 397             fhd = sys.stdout
 398         assert isinstance(fhd,io.IOBase)
 399         assert self.fw > 0, "FWTrack object .fw should be set larger than 0!"
 400
 401         chrnames = self.get_chr_names()
 402
 403         for k in chrnames:
 404             # for each chromosome.
 405             # This loop body is too big, I may need to split code later...
 406
 407             plus = self.__locations[k][0]
 408
 409             for i in range(plus.shape[0]):
 410                 p = plus[i]
 411                 fhd.write("%s\t%d\t%d\t.\t.\t%s\n" % (k.decode(),p,p+self.fw,"+") )
 412
 413             minus = self.__locations[k][1]
 414
 415             for i in range(minus.shape[0]):
 416                 p = minus[i]
 417                 fhd.write("%s\t%d\t%d\t.\t.\t%s\n" % (k.decode(),p-self.fw,p,"-") )
 418         return
 419
 420     cpdef tuple extract_region_tags ( self, bytes chromosome, int32_t startpos, int32_t endpos ):
 421         cdef:
 422             int32_t i, pos
 423             np.ndarray[int32_t, ndim=1] rt_plus, rt_minus
 424             list temp
 425             set chrnames
 426
 427         if not self.__sorted: self.sort()
 428
 429         chrnames = self.get_chr_names()
 430         assert chromosome in chrnames, "chromosome %s can't be found in the FWTrack object." % chromosome
 431
 432         (plus, minus) = self.__locations[chromosome]
 433
 434         temp = []
 435         for i in range(plus.shape[0]):
 436             pos = plus[i]
 437             if pos < startpos:
 438                 continue
 439             elif pos > endpos:
 440                 break
 441             else:
 442                 temp.append(pos)
 443         rt_plus = np.array(temp)
 444
 445         temp = []
 446         for i in range(minus.shape[0]):
 447             pos = minus[i]
 448             if pos < startpos:
 449                 continue
 450             elif pos > endpos:
 451                 break
 452             else:
 453                 temp.append(pos)
 454         rt_minus = np.array(temp)
 455         return (rt_plus, rt_minus)
 456
 457     cpdef list compute_region_tags_from_peaks ( self, peaks, func, int32_t window_size = 100, float32_t cutoff = 5 ):
 458         """Extract tags in peak, then apply func on extracted tags.
 459
 460         peaks: redefined regions to extract raw tags in PeakIO type: check cPeakIO.pyx.
 461
 462         func:  a function to compute *something* from tags found in a predefined region
 463
 464         window_size: this will be passed to func.
 465
 466         cutoff: this will be passed to func.
 467
 468         func needs the fixed number of parameters, so it's not flexible. Here is an example:
 469
 470         wtd_find_summit(chrom, plus, minus, peak_start, peak_end, name , window_size, cutoff):
 471
 472         """
 473
 474         cdef:
 475             int32_t m, i, j, pre_i, pre_j, pos, startpos, endpos
 476             np.ndarray[int32_t, ndim=1] plus, minus, rt_plus, rt_minus
 477             bytes chrom, name
 478             list temp, retval
 479             set pchrnames, chrnames
 480
 481         pchrnames = peaks.get_chr_names()
 482         retval = []
 483
 484         # this object should be sorted
 485         if not self.__sorted: self.sort()
 486         # PeakIO object should be sorted
 487         peaks.sort()
 488
 489         chrnames = self.get_chr_names()
 490
 491         for chrom in sorted(pchrnames):
 492             assert chrom in chrnames, "chromosome %s can't be found in the FWTrack object." % chrom
 493             (plus, minus) = self.__locations[chrom]
 494             cpeaks = peaks.get_data_from_chrom(chrom)
 495             prev_i = 0
 496             prev_j = 0
 497             for m in range(len(cpeaks)):
 498                 startpos = cpeaks[m]["start"] - window_size
 499                 endpos   = cpeaks[m]["end"] + window_size
 500                 name     = cpeaks[m]["name"]
 501
 502                 temp = []
 503                 for i in range(prev_i,plus.shape[0]):
 504                     pos = plus[i]
 505                     if pos < startpos:
 506                         continue
 507                     elif pos > endpos:
 508                         prev_i = i
 509                         break
 510                     else:
 511                         temp.append(pos)
 512                 rt_plus = np.array(temp, dtype="int32")
 513
 514                 temp = []
 515                 for j in range(prev_j,minus.shape[0]):
 516                     pos = minus[j]
 517                     if pos < startpos:
 518                         continue
 519                     elif pos > endpos:
 520                         prev_j = j
 521                         break
 522                     else:
 523                         temp.append(pos)
 524                 rt_minus = np.array(temp, dtype="int32")
 525
 526                 retval.append( func(chrom, rt_plus, rt_minus, startpos, endpos, name = name, window_size = window_size, cutoff = cutoff) )
 527                 # rewind window_size
 528                 for i in range(prev_i, 0, -1):
 529                     if plus[prev_i] - plus[i] >= window_size:
 530                         break
 531                 prev_i = i
 532
 533                 for j in range(prev_j, 0, -1):
 534                     if minus[prev_j] - minus[j] >= window_size:
 535                         break
 536                 prev_j = j
 537                 # end of a loop
 538
 539         return retval
 540
 541     cpdef list pileup_a_chromosome ( self, bytes chrom, list ds, list scale_factor_s, float32_t baseline_value = 0.0, bint directional = True, int32_t end_shift = 0 ):
 542         """pileup a certain chromosome, return [p,v] (end position and value) list.
 543
 544         ds             : tag will be extended to this value to 3' direction,
 545                          unless directional is False. Can contain multiple extension
 546                          values. Final pileup will the maximum.
 547         scale_factor_s  : linearly scale the pileup value applied to each d in ds. The list should have the same length as ds.
 548         baseline_value : a value to be filled for missing values, and will be the minimum pileup.
 549         directional    : if False, the strand or direction of tag will be ignored, so that extension will be both sides with d/2.
 550         end_shift      : move cutting ends towards 5->3 direction if value is positive, or towards 3->5 direction if negative. Default is 0 -- no shift at all.
 551
 552
 553         p and v are numpy.ndarray objects.
 554         """
 555         cdef:
 556             int64_t d
 557             int64_t five_shift, three_shift  # adjustment to 5' end and 3' end positions to make a fragment
 558             dict chrlengths = self.get_rlengths ()
 559             int64_t rlength = chrlengths[chrom]
 560             object ends
 561             list five_shift_s = []
 562             list three_shift_s = []
 563             list tmp_pileup, prev_pileup
 564
 565         assert len(ds) == len(scale_factor_s), "ds and scale_factor_s must have the same length!"
 566
 567         # adjust extension length according to 'directional' and 'halfextension' setting.
 568         for d in ds:
 569             if directional:
 570                 # only extend to 3' side
 571                 five_shift_s.append(  - end_shift )
 572                 three_shift_s.append( end_shift + d)
 573             else:
 574                 # both sides
 575                 five_shift_s.append( d//2 - end_shift )
 576                 three_shift_s.append( end_shift + d - d//2)
 577
 578         prev_pileup = None
 579
 580         for i in range(len(ds)):
 581             five_shift = five_shift_s[i]
 582             three_shift = three_shift_s[i]
 583             scale_factor = scale_factor_s[i]
 584             tmp_pileup = se_all_in_one_pileup ( self.__locations[chrom][0], self.__locations[chrom][1], five_shift, three_shift, rlength, scale_factor, baseline_value )
 585
 586             if prev_pileup:
 587                 prev_pileup = over_two_pv_array ( prev_pileup, tmp_pileup, func="max" )
 588             else:
 589                 prev_pileup = tmp_pileup
 590
 591         return prev_pileup
 592
 593 cdef inline int32_t left_sum ( data, int32_t pos, int32_t width ):
 594     """
 595     """
 596     return sum([data[x] for x in data if x <= pos and x >= pos - width])
 597
 598 cdef inline int32_t right_sum ( data, int32_t pos, int32_t width ):
 599     """
 600     """
 601     return sum([data[x] for x in data if x >= pos and x <= pos + width])
 602
 603 cdef inline int32_t left_forward ( data, int32_t pos, int32_t window_size ):
 604     return data.get(pos,0) - data.get(pos-window_size, 0)
 605
 606 cdef inline int32_t right_forward ( data, int32_t pos, int32_t window_size ):
 607     return data.get(pos + window_size, 0) - data.get(pos, 0)
 608