# cython: language_level=3
# cython: profile=True
# Time-stamp: <2024-10-10 16:45:13 Tao Liu>

"""Module for Feature IO classes.

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""

# ------------------------------------
# python modules
# ------------------------------------
from copy import copy
from functools import reduce

# ------------------------------------
# MACS3 modules
# ------------------------------------
from MACS3.Signal.SignalProcessing import maxima, enforce_valleys, enforce_peakyness
from MACS3.Signal.Prob import poisson_cdf
from MACS3.IO.PeakIO import PeakIO, BroadPeakIO

# ------------------------------------
# Other modules
# ------------------------------------
cimport cython
import numpy as np
cimport numpy as np
from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t
from cpython cimport bool
from cykhash import PyObjectMap, Float32to32Map

# ------------------------------------
# C lib
# ------------------------------------
from libc.math cimport log10, log, floor, ceil

# ------------------------------------
# constants
# ------------------------------------
__version__ = "scoreTrack $Revision$"
__author__ = "Tao Liu <vladimir.liu@gmail.com>"
__doc__ = "scoreTrack classes"

# ------------------------------------
# Misc functions
# ------------------------------------
cdef inline int32_t int_max(int32_t a, int32_t b): return a if a >= b else b
cdef inline int32_t int_min(int32_t a, int32_t b): return a if a <= b else b

LOG10_E = 0.43429448190325176

pscore_dict = PyObjectMap()

cdef float32_t get_pscore ( int32_t observed, float32_t expectation ):
    """Get p-value score from Poisson test. First check the existing
    table; if that fails, call the poisson_cdf function, then store
    the result in the table.
    """
    cdef:
        float64_t score

    try:
        return pscore_dict[(observed, expectation)]
    except KeyError:
        score = -1 * poisson_cdf(observed, expectation, False, True)
        pscore_dict[(observed, expectation)] = score
        return score
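
# Illustrative example (not part of the original file; assumes poisson_cdf
# with log_pvalue=True returns the log10 upper-tail p-value, so get_pscore
# yields -log10(p)):
#
#   score = get_pscore(10, 2.0)    # P(X >= 10 | lambda=2) ~ 4.6e-5 -> score ~ 4.3
#   score = get_pscore(10, 2.0)    # repeated query is served from pscore_dict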

asym_logLR_dict = PyObjectMap()

cdef float32_t logLR_asym ( float32_t x, float32_t y ):
    """Calculate the log10 likelihood ratio between H1 ( enriched ) and
    H0 ( chromatin bias ). Set minus sign for depletion.

    *asymmetric version*
    """
    cdef:
        float32_t s

    if (x,y) in asym_logLR_dict:
        return asym_logLR_dict[ ( x, y ) ]
    else:
        if x > y:
            s = (x*(log(x)-log(y))+y-x)*LOG10_E
        elif x < y:
            s = (x*(-log(x)+log(y))-y+x)*LOG10_E
        else:
            s = 0
        asym_logLR_dict[ ( x, y ) ] = s
        return s
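
# Worked example (illustrative): for x=10, y=5 the x > y branch gives
#   s = (10*(ln 10 - ln 5) + 5 - 10) * LOG10_E ~ 0.84
# while x=5, y=10 takes the x < y branch and returns a negative (depletion)
# score; the two branches are not mirror images, hence "asymmetric".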

sym_logLR_dict = PyObjectMap()

cdef float32_t logLR_sym ( float32_t x, float32_t y ):
    """Calculate the log10 likelihood ratio between H1 ( enriched ) and
    H0 ( another enriched ). Set minus sign for H0>H1.

    * symmetric version *
    """
    cdef:
        float32_t s

    if (x,y) in sym_logLR_dict:
        return sym_logLR_dict[ ( x, y ) ]
    else:
        if x > y:
            s = (x*(log(x)-log(y))+y-x)*LOG10_E
        elif y > x:
            s = (y*(log(x)-log(y))+y-x)*LOG10_E
        else:
            s = 0
        sym_logLR_dict[ ( x, y ) ] = s
        return s
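
# Worked example (illustrative): logLR_sym(10, 5) ~ 0.84 and logLR_sym(5, 10)
# ~ -0.84; swapping the two enriched tracks only flips the sign, hence
# "symmetric".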

cdef float32_t get_logFE ( float32_t x, float32_t y ):
    """Return the log10 fold enrichment; the pseudocount is expected
    to be added by the caller.
    """
    return log10( x/y )

cdef float32_t get_subtraction ( float32_t x, float32_t y):
    """Return the subtraction x - y.
    """
    return x - y

# ------------------------------------
# Classes
# ------------------------------------

cdef class ScoreTrackII:
    """Class for a container to keep signals of each genomic position,
    including 1. score, 2. treatment pileup, and 3. control pileup.

    It also contains scoring methods and call_peak functions.
    """
    cdef:
        dict data                      # dictionary for data of each chromosome
        dict datalength                # length of data array of each chromosome
        bool trackline                 # whether trackline should be saved in bedGraph
        float32_t treat_edm            # seq depth in million of treatment
        float32_t ctrl_edm             # seq depth in million of control
        char scoring_method            # method for calculating scores.
        char normalization_method      # scale to control? scale to treatment? both scale to 1 million reads?
        float32_t pseudocount          # the pseudocount used to calculate logLR, FE or logFE
        float32_t cutoff
        dict pvalue_stat               # save pvalue<->length dictionary

    def __init__ (self, float32_t treat_depth, float32_t ctrl_depth, float32_t pseudocount = 1.0 ):
        """Initialize.

        treat_depth and ctrl_depth are effective depths in million:
            sequencing depth in million after duplicates being
            filtered. If treatment is scaled down to control sample
            size, then this should be control sample size in million,
            and vice versa.

        pseudocount: a pseudocount used to calculate logLR, FE or
            logFE. Please note this value will not be changed with the
            normalization method. So if you really want to set a
            pseudocount of 1 per million reads, set it after you
            normalize treat and control by million reads by
            `change_normalization_method(ord('M'))`.
        """
        self.data = {}                  # for each chromosome, there is a l*4
                                        # matrix. First column: end position
                                        # of a region; second: treatment
                                        # pileup; third: control pileup;
                                        # fourth: score ( can be
                                        # p/q-value/likelihood
                                        # ratio/fold-enrichment/subtraction
                                        # depending on -c setting)
        self.datalength = {}
        self.trackline = False
        self.treat_edm = treat_depth
        self.ctrl_edm = ctrl_depth
        # scoring_method:  p: -log10 pvalue;
        #                  q: -log10 qvalue;
        #                  l: log10 likelihood ratio ( minus for depletion )
        #                  f: log10 fold enrichment
        #                  F: linear fold enrichment
        #                  d: subtraction
        #                  m: fragment pileup per million reads
        #                  N: not set
        self.scoring_method = ord("N")
        # normalization_method: T: scale to depth of treatment;
        #                       C: scale to depth of control;
        #                       M: scale to depth of 1 million;
        #                       N: not set/ raw pileup
        self.normalization_method = ord("N")
        self.pseudocount = pseudocount
        self.pvalue_stat = {}
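
    # Usage sketch (illustrative; made-up depths and intervals, not part of
    # the original file):
    #
    #   st = ScoreTrackII(treat_depth=10.0, ctrl_depth=10.0)
    #   st.add_chromosome(b"chr1", 3)
    #   st.add(b"chr1", 100, 15.0, 2.0)    # interval [0, 100): chip=15, control=2
    #   st.add(b"chr1", 200, 3.0, 2.0)     # interval [100, 200)
    #   st.finalize()
    #   st.change_score_method(ord('p'))   # fill the score column with -log10 p-values
    #   peaks = st.call_peaks(cutoff=5.0, min_length=200, max_gap=50)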

    cpdef set_pseudocount( self, float32_t pseudocount ):
        self.pseudocount = pseudocount

    cpdef enable_trackline( self ):
        """Turn on trackline with bedGraph output.
        """
        self.trackline = True

    cpdef add_chromosome ( self, bytes chrom, int32_t chrom_max_len ):
        """
        chrom: chromosome name
        chrom_max_len: maximum number of data points in this chromosome
        """
        if chrom not in self.data:
            #self.data[chrom] = np.zeros( ( chrom_max_len, 4 ), dtype="int32" ) # remember col #2-4 is actual value * 100, I use integer here.
            self.data[chrom] = [ np.zeros( chrom_max_len, dtype="int32" ),   # pos
                                 np.zeros( chrom_max_len, dtype="float32" ), # pileup at each interval, in float32 format
                                 np.zeros( chrom_max_len, dtype="float32" ), # control at each interval, in float32 format
                                 np.zeros( chrom_max_len, dtype="float32" ) ] # score at each interval, in float32 format
            self.datalength[chrom] = 0

    cpdef add (self, bytes chromosome, int32_t endpos, float32_t chip, float32_t control):
        """Add a chr-endpos-sample-control block into data
        dictionary.

        chromosome: chromosome name in string
        endpos    : end position of each interval in integer
        chip      : ChIP pileup value of each interval in float
        control   : Control pileup value of each interval in float

        *Warning* Need to add regions continuously.
        """
        cdef int32_t i
        i = self.datalength[chromosome]
        c = self.data[chromosome]
        c[0][ i ] = endpos
        c[1][ i ] = chip
        c[2][ i ] = control
        self.datalength[chromosome] += 1

    cpdef finalize ( self ):
        """
        Adjust array size of each chromosome.
        """
        cdef:
            bytes chrom, k
            int32_t l

        for chrom in sorted(self.data.keys()):
            d = self.data[chrom]
            l = self.datalength[chrom]
            d[0].resize( l, refcheck = False )
            d[1].resize( l, refcheck = False )
            d[2].resize( l, refcheck = False )
            d[3].resize( l, refcheck = False )
        return

    cpdef get_data_by_chr (self, bytes chromosome):
        """Return array of counts by chromosome.

        The return value is a tuple:
        ([end pos],[value])
        """
        if chromosome in self.data:
            return self.data[chromosome]
        else:
            return None

    cpdef get_chr_names (self):
        """Return all the chromosome names stored.
        """
        l = set(self.data.keys())
        return l

    cpdef change_normalization_method ( self, char normalization_method ):
        """Change/set the normalization method. However, I do not
        recommend changing this back and forth, since some precision
        issues will happen -- I only keep two digits.

        normalization_method: T: scale to depth of treatment;
                              C: scale to depth of control;
                              M: scale to depth of 1 million;
                              N: not set/ raw pileup
        """
        if normalization_method == ord('T'):
            if self.normalization_method == ord('T'): # do nothing
                pass
            elif self.normalization_method == ord('C'):
                self.normalize( self.treat_edm/self.ctrl_edm, self.treat_edm/self.ctrl_edm )
            elif self.normalization_method == ord('M'):
                self.normalize( self.treat_edm, self.treat_edm )
            elif self.normalization_method == ord('N'):
                self.normalize( 1, self.treat_edm/self.ctrl_edm )
            else:
                raise NotImplementedError
            self.normalization_method = ord('T')
        elif normalization_method == ord('C'):
            if self.normalization_method == ord('T'):
                self.normalize( self.ctrl_edm/self.treat_edm, self.ctrl_edm/self.treat_edm )
            elif self.normalization_method == ord('C'): # do nothing
                pass
            elif self.normalization_method == ord('M'):
                self.normalize( self.ctrl_edm, self.ctrl_edm )
            elif self.normalization_method == ord('N'):
                self.normalize( self.ctrl_edm/self.treat_edm, 1 )
            else:
                raise NotImplementedError
            self.normalization_method = ord('C')
        elif normalization_method == ord('M'):
            if self.normalization_method == ord('T'):
                self.normalize( 1/self.treat_edm, 1/self.treat_edm )
            elif self.normalization_method == ord('C'):
                self.normalize( 1/self.ctrl_edm, 1/self.ctrl_edm )
            elif self.normalization_method == ord('M'): # do nothing
                pass
            elif self.normalization_method == ord('N'):
                self.normalize( 1/self.treat_edm, 1/self.ctrl_edm )
            else:
                raise NotImplementedError
            self.normalization_method = ord('M')
        elif normalization_method == ord('N'):
            if self.normalization_method == ord('T'):
                self.normalize( self.treat_edm, self.treat_edm )
            elif self.normalization_method == ord('C'):
                self.normalize( self.ctrl_edm, self.ctrl_edm )
            elif self.normalization_method == ord('M'):
                self.normalize( self.treat_edm, self.ctrl_edm )
            elif self.normalization_method == ord('N'): # do nothing
                pass
            else:
                raise NotImplementedError
            self.normalization_method = ord('N')
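
    # Example of the conversion math (illustrative; made-up depths): with
    # treat_edm=20 and ctrl_edm=10, going from raw 'N' to 'T' multiplies the
    # control column by 20/10 = 2 and leaves treatment untouched; a later
    # switch 'T' -> 'M' rescales both columns by 1/20, matching a direct
    # 'N' -> 'M' normalization up to float rounding.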

    cdef normalize ( self, float32_t treat_scale, float32_t control_scale ):
        cdef:
            np.ndarray p, c
            int64_t l, i

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            l = self.datalength[chrom]
            for i in range(l):
                p[ i ] *= treat_scale
                c[ i ] *= control_scale
        return

    cpdef change_score_method (self, char scoring_method):
        """
        scoring_method:  p: -log10 pvalue;
                         q: -log10 qvalue;
                         l: log10 likelihood ratio ( minus for depletion )
                         s: symmetric log10 likelihood ratio ( for comparing two ChIPs )
                         f: log10 fold enrichment
                         F: linear fold enrichment
                         d: subtraction
                         M: maximum
                         m: fragment pileup per million reads
        """
        if scoring_method == ord('p'):
            self.compute_pvalue()
        elif scoring_method == ord('q'):
            # if the p-value hasn't been computed yet, compute it first
            if self.scoring_method != ord('p'):
                self.compute_pvalue()
            self.compute_qvalue()
        elif scoring_method == ord('l'):
            self.compute_likelihood()
        elif scoring_method == ord('s'):
            self.compute_sym_likelihood()
        elif scoring_method == ord('f'):
            self.compute_logFE()
        elif scoring_method == ord('F'):
            self.compute_foldenrichment()
        elif scoring_method == ord('d'):
            self.compute_subtraction()
        elif scoring_method == ord('m'):
            self.compute_SPMR()
        elif scoring_method == ord('M'):
            self.compute_max()
        else:
            raise NotImplementedError

    cdef compute_pvalue ( self ):
        """Compute -log_{10}(pvalue).
        """
        cdef:
            np.ndarray[np.float32_t] p, c, v
            np.ndarray[np.int32_t] pos
            int64_t l, i, prev_pos
            bytes chrom

        for chrom in sorted(self.data.keys()):
            prev_pos = 0
            pos = self.data[chrom][0]
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = get_pscore( <int32_t>(p[ i ] + self.pseudocount) , c[ i ] + self.pseudocount )
                try:
                    self.pvalue_stat[v[ i ]] += pos[ i ] - prev_pos
                except KeyError:
                    self.pvalue_stat[v[ i ]] = pos[ i ] - prev_pos
                prev_pos = pos[ i ]

        self.scoring_method = ord('p')
        return

    cdef compute_qvalue ( self ):
        """Compute -log_{10}(qvalue).
        """
        cdef:
            object pqtable
            int64_t i, l, j
            bytes chrom
            np.ndarray p, c, v

        # pvalue should be computed first!
        assert self.scoring_method == ord('p')
        # make pqtable
        pqtable = self.make_pq_table()

        # convert p to q
        for chrom in sorted(self.data.keys()):
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = pqtable[ v[ i ] ]

        self.scoring_method = ord('q')
        return

    cpdef object make_pq_table ( self ):
        """Make a pvalue-qvalue table.

        Step 1: get all pvalues and the total basepairs of the blocks with each pvalue
        Step 2: sort them
        Step 3: apply the AFDR method to adjust pvalues and get a qvalue for each pvalue

        Return a dictionary of {-log10pvalue: -log10qvalue} relationships.
        """
        cdef:
            int64_t n, pre_p, this_p, length, pre_l, l, i, j
            float32_t this_v, pre_v, v, q, pre_q # store the p and q scores
            int64_t N, k
            float32_t f
            bytes chrom
            np.ndarray v_chrom, pos_chrom
            object pvalue2qvalue
            dict pvalue_stat
            list unique_values

        assert self.scoring_method == ord('p')

        pvalue_stat = self.pvalue_stat

        N = sum(pvalue_stat.values())
        k = 1                           # rank
        f = -log10(N)
        pre_v = -2147483647
        pre_l = 0
        pre_q = 2147483647              # save the previous q-value

        pvalue2qvalue = Float32to32Map( for_int = False )
        unique_values = sorted(list(pvalue_stat.keys()), reverse=True)
        for i in range(len(unique_values)):
            v = unique_values[i]
            l = pvalue_stat[v]
            q = v + (log10(k) + f)      # i.e. q = p - log10(N/k) in -log10 space
            if q > pre_q:
                q = pre_q
            if q <= 0:
                q = 0
                break
            pvalue2qvalue[ v ] = q
            pre_q = q
            k += l
        # bottom-ranked pscores all have qscores of 0
        for j in range(i, len(unique_values) ):
            v = unique_values[ j ]
            pvalue2qvalue[ v ] = 0
        return pvalue2qvalue
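
    # Worked example (illustrative; toy numbers): suppose pvalue_stat holds
    # three -log10 p-scores over N = 1000 bp: {5.0: 100, 3.0: 400, 1.0: 500}.
    # Scanning from the largest score, with rank k counted in basepairs:
    #   v=5.0, k=1:   q = 5.0 + log10(1)   - log10(1000) = 2.0
    #   v=3.0, k=101: q = 3.0 + log10(101) - log10(1000) ~ 2.0 (capped at pre_q)
    #   v=1.0, k=501: q = 1.0 + log10(501) - log10(1000) ~ 0.7
    # i.e. a Benjamini-Hochberg-style adjustment per basepair, kept monotone.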

    cdef compute_likelihood ( self ):
        """Calculate log10 likelihood ratio.
        """
        cdef:
            #np.ndarray v, p, c
            int64_t l, i
            bytes chrom
            float32_t v1, v2
            float32_t pseudocount

        pseudocount = self.pseudocount

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][ 1 ].flat.__next__ # pileup in treatment
            c = self.data[chrom][ 2 ].flat.__next__ # pileup in control
            v = self.data[chrom][ 3 ]               # score
            l = self.datalength[chrom]
            v1 = 2
            v2 = 1
            for i in range(l):
                v1 = p()
                v2 = c()
                v[ i ] = logLR_asym( v1 + pseudocount, v2 + pseudocount )
        self.scoring_method = ord('l')
        return

    cdef compute_sym_likelihood ( self ):
        """Calculate symmetric log10 likelihood ratio.
        """
        cdef:
            #np.ndarray v, p, c
            int64_t l, i
            bytes chrom
            float32_t v1, v2
            float32_t pseudocount

        pseudocount = self.pseudocount

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][ 1 ].flat.__next__
            c = self.data[chrom][ 2 ].flat.__next__
            v = self.data[chrom][ 3 ]
            l = self.datalength[chrom]
            v1 = 2
            v2 = 1
            for i in range(l):
                v1 = p()
                v2 = c()
                v[ i ] = logLR_sym( v1 + pseudocount, v2 + pseudocount )
        self.scoring_method = ord('s')
        return

    cdef compute_logFE ( self ):
        """Calculate log10 fold enrichment ( with the pseudocount ).
        """
        cdef:
            np.ndarray p, c, v
            int64_t l, i
            float32_t pseudocount

        pseudocount = self.pseudocount

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = get_logFE ( p[ i ] + pseudocount, c[ i ] + pseudocount)
        self.scoring_method = ord('f')
        return

    cdef compute_foldenrichment ( self ):
        """Calculate linear scale fold enrichment ( with the pseudocount ).
        """
        cdef:
            np.ndarray p, c, v
            int64_t l, i
            float32_t pseudocount

        pseudocount = self.pseudocount

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = ( p[ i ] + pseudocount )/( c[ i ] + pseudocount )
        self.scoring_method = ord('F')
        return

    cdef compute_subtraction ( self ):
        """Calculate the treatment minus control subtraction score.
        """
        cdef:
            np.ndarray p, c, v
            int64_t l, i

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = p[ i ] - c[ i ]
        self.scoring_method = ord('d')
        return

    cdef compute_SPMR ( self ):
        """Calculate signal per million reads (SPMR) from the treatment
        pileup; the scale depends on the current normalization method.
        """
        cdef:
            np.ndarray p, v
            int64_t l, i
            float32_t scale
        if self.normalization_method == ord('T') or self.normalization_method == ord('N'):
            scale = self.treat_edm
        elif self.normalization_method == ord('C'):
            scale = self.ctrl_edm
        elif self.normalization_method == ord('M'):
            scale = 1

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = p[ i ] / scale # two digit precision may not be enough...
        self.scoring_method = ord('m')
        return

    cdef compute_max ( self ):
        """Use the maximum of treatment and control pileup as the score.
        """
        cdef:
            np.ndarray p, c, v
            int64_t l, i

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = max(p[ i ], c[ i ])
        self.scoring_method = ord('M')
        return

    cpdef write_bedGraph ( self, fhd, str name, str description, short column = 3):
        """Write all data to fhd in bedGraph format.

        fhd: a filehandler to save bedGraph.

        name/description: the name and description in the track line.

        column: can be 1: chip, 2: control, 3: score
        """
        cdef:
            bytes chrom
            int32_t l, pre, i, p
            float32_t pre_v, v
            set chrs
            np.ndarray pos, value

        assert column in range( 1, 4 ), "column should be 1, 2 or 3."

        write = fhd.write

        if self.trackline:
            # this line is REQUIRED by the wiggle format for UCSC browser
            write( "track type=bedGraph name=\"%s\" description=\"%s\"\n" % ( name, description ) )

        chrs = self.get_chr_names()
        for chrom in sorted(chrs):
            pos = self.data[ chrom ][ 0 ]
            value = self.data[ chrom ][ column ]
            l = self.datalength[ chrom ]
            pre = 0
            if pos.shape[ 0 ] == 0: continue # skip if there's no data
            pre_v = value[ 0 ]
            for i in range( 1, l ):
                v = value[ i ]
                p = pos[ i-1 ]
                #if ('%.5f' % pre_v) != ('%.5f' % v):
                if abs(pre_v - v) > 1e-5: # precision is 5 digits
                    write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) )
                    pre_v = v
                    pre = p
            p = pos[ -1 ]
            # last one
            write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) )

        return True
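
    # Example output (illustrative): with trackline enabled, the file starts
    # with a track line followed by one row per run of near-identical scores:
    #
    #   track type=bedGraph name="treat" description="treatment scores"
    #   chr1    0       100     4.76821
    #   chr1    100     200     0.30103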

    cpdef call_peaks (self, float32_t cutoff=5.0, int32_t min_length=200, int32_t max_gap=50, bool call_summits=False):
        """This function tries to find regions within which scores
        are continuously higher than a given cutoff.

        This function is NOT using sliding-windows. Instead, any
        region in the bedGraph above a certain cutoff will be detected,
        then merged with a nearby region if the gap between the two is
        below max_gap. After this, a peak is reported if its length is
        above min_length.

        cutoff:  cutoff of value, default 5. For -log10pvalue, it means 10^-5.
        min_length :  minimum peak length, default 200.
        max_gap   :  maximum gap to merge nearby peaks, default 50.
        call_summits: whether to call sub-peak summits.
        """
        cdef:
            int32_t i
            bytes chrom
            np.ndarray pos, sample, control, value, above_cutoff, above_cutoff_v, above_cutoff_endpos, above_cutoff_startpos, above_cutoff_sv
            list peak_content

        chrs = self.get_chr_names()
        peaks = PeakIO()                # dictionary to save peaks

        self.cutoff = cutoff
        for chrom in sorted(chrs):
            peak_content = []           # to store points above cutoff

            pos = self.data[chrom][ 0 ]
            sample = self.data[chrom][ 1 ]
            control = self.data[chrom][ 2 ]
            value = self.data[chrom][ 3 ]

            above_cutoff = np.nonzero( value >= cutoff )[0] # indices where score is above cutoff
            above_cutoff_v = value[above_cutoff] # scores where score is above cutoff

            above_cutoff_endpos = pos[above_cutoff] # end positions of regions where score is above cutoff
            above_cutoff_startpos = pos[above_cutoff-1] # start positions of regions where score is above cutoff
            above_cutoff_sv = sample[above_cutoff] # sample pileup height where score is above cutoff

            if above_cutoff_v.size == 0:
                # nothing above cutoff
                continue

            if above_cutoff[0] == 0:
                # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
                above_cutoff_startpos[0] = 0

            # first bit of region above cutoff
            peak_content.append( (above_cutoff_startpos[0], above_cutoff_endpos[0], above_cutoff_v[0], above_cutoff_sv[0], above_cutoff[0]) )
            for i in range( 1, above_cutoff_startpos.size ):
                if above_cutoff_startpos[i] - peak_content[-1][1] <= max_gap:
                    # append
                    peak_content.append( (above_cutoff_startpos[i], above_cutoff_endpos[i], above_cutoff_v[i], above_cutoff_sv[i], above_cutoff[i]) )
                else:
                    # close
                    if call_summits:
                        self.__close_peak2(peak_content, peaks, min_length, chrom, max_gap//2 )
                    else:
                        self.__close_peak(peak_content, peaks, min_length, chrom )
                    peak_content = [(above_cutoff_startpos[i], above_cutoff_endpos[i], above_cutoff_v[i], above_cutoff_sv[i], above_cutoff[i]),]

            # save the last peak
            if not peak_content:
                continue
            else:
                if call_summits:
                    self.__close_peak2(peak_content, peaks, min_length, chrom, max_gap//2 )
                else:
                    self.__close_peak(peak_content, peaks, min_length, chrom )

        return peaks
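
    # Merge behaviour in brief (illustrative): with max_gap=50, above-cutoff
    # blocks [100, 300) and [340, 600) are joined into one candidate peak
    # (gap 40 <= 50), while a following block starting at 700 (gap 100) closes
    # the current peak and starts a new one.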

    cdef bool __close_peak (self, list peak_content, object peaks, int32_t min_length,
                            bytes chrom):
        """Close the peak region, output peak boundaries, peak summit
        and scores, then add the peak to the PeakIO object.

        In this function, we define the peak summit as the middle
        point of the region with the highest score in this peak. For
        example, if the region of the highest score is from 100 to
        200, the summit is 150. If there are several regions with the
        same 'highest score', we will first calculate the possible
        summit for each such region, then pick a position close to the
        middle index ( = (len(highest_regions) + 1) / 2 ) of these
        summits. For example, if there are three regions with the same
        highest scores, [100,200], [300,400], [600,700], we will first
        find the possible summits as 150, 350, and 650, and then pick
        the middle index, the 2nd, of the three positions -- 350 as
        the final summit. If there are four regions, we pick the 2nd
        as well.

        peaks: a PeakIO object
        """
        cdef:
            int32_t summit_pos, tstart, tend, tmpindex, summit_index, i, midindex
            float32_t summit_value, tvalue, tsummitvalue

        peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
        if peak_length >= min_length:   # reject the peak if it is too small
            tsummit = []
            summit_pos = 0
            summit_value = 0
            for i in range(len(peak_content)):
                (tstart, tend, tvalue, tsummitvalue, tindex) = peak_content[i]
                if not summit_value or summit_value < tsummitvalue:
                    tsummit = [(tend + tstart) / 2, ]
                    tsummit_index = [ tindex, ]
                    summit_value = tsummitvalue
                elif summit_value == tsummitvalue:
                    # remember continuous summit values
                    tsummit.append(int((tend + tstart) / 2))
                    tsummit_index.append( tindex )
            # the middle of all highest points in the peak region is defined as the summit
            midindex = int((len(tsummit) + 1) / 2) - 1
            summit_pos = tsummit[ midindex ]
            summit_index = tsummit_index[ midindex ]
            if self.scoring_method == ord('q'):
                qscore = self.data[chrom][3][ summit_index ]
            else:
                # if the q-value is not computed, use -1
                qscore = -1

            peaks.add( chrom,
                       peak_content[0][0],
                       peak_content[-1][1],
                       summit = summit_pos,
                       peak_score = self.data[chrom][ 3 ][ summit_index ],
                       pileup = self.data[chrom][ 1 ][ summit_index ], # should be the same as summit_value
                       pscore = get_pscore(self.data[chrom][ 1 ][ summit_index ], self.data[chrom][ 2 ][ summit_index ]),
                       fold_change = ( self.data[chrom][ 1 ][ summit_index ] + self.pseudocount ) / ( self.data[chrom][ 2 ][ summit_index ] + self.pseudocount ),
                       qscore = qscore,
                       )
            # start a new peak
            return True

    cdef bool __close_peak2 (self, list peak_content, object peaks, int32_t min_length,
                             bytes chrom, int32_t smoothlen=51,
                             float32_t min_valley = 0.9):
        """Close the peak region, output peak boundaries, peak summit
        and scores, then add the peak to the PeakIO object.

        In this function, we use signal processing methods to smooth
        the scores in the peak region, find the maxima and enforce the
        peaky shape, and define the best maxima as the peak
        summit. The functions used for signal processing are 'maxima'
        (with 2nd order polynomial filter) and 'enforce_peakyness'
        in SignalProcessing.pyx.

        peaks: a PeakIO object

        """
        cdef:
            int32_t summit_pos, tstart, tend, tmpindex, summit_index, summit_offset
            int32_t start, end, i, j, start_boundary
            float32_t summit_value, tvalue, tsummitvalue
            # np.ndarray[np.float32_t, ndim=1] w
            np.ndarray[np.float32_t, ndim=1] peakdata
            np.ndarray[np.int32_t, ndim=1] peakindices, summit_offsets

        # Add 10 bp padding to the peak region so that we can get true minima
        end = peak_content[ -1 ][ 1 ] + 10
        start = peak_content[ 0 ][ 0 ] - 10
        if start < 0:
            start_boundary = 10 + start
            start = 0
        else:
            start_boundary = 10
        peak_length = end - start
        if end - start < min_length: return # if the region is too small, reject it

        peakdata = np.zeros(end - start, dtype='float32')
        peakindices = np.zeros(end - start, dtype='int32')
        for (tstart, tend, tvalue, tsvalue, tmpindex) in peak_content:
            i = tstart - start + start_boundary
            j = tend - start + start_boundary
            peakdata[i:j] = tsvalue
            peakindices[i:j] = tmpindex
        summit_offsets = maxima(peakdata, smoothlen)
        if summit_offsets.shape[0] == 0:
            # **failsafe** if no summits, fall back on old approach #
            return self.__close_peak(peak_content, peaks, min_length, chrom)
        else:
            # remove maxima that occurred in padding
            i = np.searchsorted(summit_offsets, start_boundary)
            j = np.searchsorted(summit_offsets, peak_length + start_boundary, 'right')
            summit_offsets = summit_offsets[i:j]

        summit_offsets = enforce_peakyness(peakdata, summit_offsets)
        if summit_offsets.shape[0] == 0:
            # **failsafe** if no summits, fall back on old approach #
            return self.__close_peak(peak_content, peaks, min_length, chrom)

        summit_indices = peakindices[summit_offsets]
        summit_offsets -= start_boundary

        peak_scores = self.data[chrom][3][ summit_indices ]
        if not (peak_scores > self.cutoff).all():
            return self.__close_peak(peak_content, peaks, min_length, chrom)
        for summit_offset, summit_index in zip(summit_offsets, summit_indices):
            if self.scoring_method == ord('q'):
                qscore = self.data[chrom][3][ summit_index ]
            else:
                # if the q-value is not computed, use -1
                qscore = -1
            peaks.add( chrom,
                       start,
                       end,
                       summit = start + summit_offset,
                       peak_score = self.data[chrom][3][ summit_index ],
                       pileup = self.data[chrom][1][ summit_index ], # should be the same as summit_value
                       pscore = get_pscore(self.data[chrom][ 1 ][ summit_index ], self.data[chrom][ 2 ][ summit_index ]),
                       fold_change = ( self.data[chrom][ 1 ][ summit_index ] + self.pseudocount ) / ( self.data[chrom][ 2 ][ summit_index ] + self.pseudocount ),
                       qscore = qscore,
                       )
        # start a new peak
        return True
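
    # Flow sketch (illustrative): each (start, end, score, sample_value, index)
    # block paints its sample pileup into the padded peakdata array; maxima()
    # proposes smoothed summit offsets, offsets inside the padding are trimmed
    # with np.searchsorted, enforce_peakyness() filters weak candidates, and
    # only if every surviving summit clears self.cutoff is one PeakIO entry
    # added per summit -- otherwise the region falls back to __close_peak().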

    cdef int64_t total ( self ):
        """Return the number of regions in this object.
        """
        cdef:
            int64_t t
            bytes chrom

        t = 0
        for chrom in sorted(self.data.keys()):
            t += self.datalength[chrom]
        return t

    cpdef call_broadpeaks (self, float32_t lvl1_cutoff=5.0, float32_t lvl2_cutoff=1.0, int32_t min_length=200, int32_t lvl1_max_gap=50, int32_t lvl2_max_gap=400):
        """This function tries to find enriched regions within which
        scores are continuously higher than a given cutoff for level
        1, and links them using the gap above the level 2 cutoff with a
        maximum length of lvl2_max_gap.

        lvl1_cutoff:  cutoff of value at enriched regions, default 5.0.
        lvl2_cutoff:  cutoff of value at linkage regions, default 1.0.
        min_length :  minimum peak length, default 200.
        lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50.
        lvl2_max_gap : maximum length of linkage regions, default 400.

        Return both the general PeakIO object for highly enriched regions
        and the gapped broad regions in BroadPeakIO.
        """
        cdef:
            int32_t i
            bytes chrom

        assert lvl1_cutoff > lvl2_cutoff, "level 1 cutoff should be larger than level 2."
        assert lvl1_max_gap < lvl2_max_gap, "level 2 maximum gap should be larger than level 1."
        lvl1_peaks = self.call_peaks(cutoff=lvl1_cutoff, min_length=min_length, max_gap=lvl1_max_gap)
        lvl2_peaks = self.call_peaks(cutoff=lvl2_cutoff, min_length=min_length, max_gap=lvl2_max_gap)
        chrs = lvl1_peaks.peaks.keys()
        broadpeaks = BroadPeakIO()
        # use lvl2_peaks as linking regions between lvl1_peaks
        for chrom in sorted(chrs):
            lvl1peakschrom = lvl1_peaks.peaks[chrom]
            lvl2peakschrom = lvl2_peaks.peaks[chrom]
            lvl1peakschrom_next = iter(lvl1peakschrom).__next__
            tmppeakset = []             # to temporarily store lvl1 regions inside a lvl2 region
            # our assumption is that lvl1 regions should be included in lvl2 regions
            try:
                lvl1 = lvl1peakschrom_next()
                for i in range( len(lvl2peakschrom) ):
                    # for each lvl2 peak, find all lvl1 peaks inside
                    # I assume lvl1 peaks can be ALL covered by lvl2 peaks.
                    lvl2 = lvl2peakschrom[i]

                    while True:
                        if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]:
                            tmppeakset.append(lvl1)
                            lvl1 = lvl1peakschrom_next()
                        else:
                            # make a hierarchical broad peak
                            self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset)
                            tmppeakset = []
                            break
            except StopIteration:
                # no more strong (aka lvl1) peaks left
                self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset)
                tmppeakset = []
                # add the rest of the lvl2 peaks
                for j in range( i+1, len(lvl2peakschrom) ):
                    self.__add_broadpeak( broadpeaks, chrom, lvl2peakschrom[j], tmppeakset )

        return broadpeaks

    def __add_broadpeak (self, bpeaks, bytes chrom, dict lvl2peak, list lvl1peakset):
        """Internal function to create a broad peak.
        """
        cdef:
            int32_t blockNum, start, end
            bytes blockSizes, blockStarts, thickStart, thickEnd

        start = lvl2peak["start"]
        end = lvl2peak["end"]

        # the following code will add those broad/lvl2 peaks with no strong/lvl1 peaks inside
        if not lvl1peakset:
            # will complement by adding 1bp start and end marks to this region
            # may change in the future if the gappedPeak format is improved.
            bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=(b"%d" % start), thickEnd=(b"%d" % end),
                       blockNum = 2, blockSizes = b"1,1", blockStarts = (b"0,%d" % (end-start-1)), pileup = lvl2peak["pileup"],
                       pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"],
                       qscore = lvl2peak["qscore"] )
            return bpeaks

        thickStart = b"%d" % lvl1peakset[0]["start"]
        thickEnd = b"%d" % lvl1peakset[-1]["end"]
        blockNum = int(len(lvl1peakset))
        blockSizes = b",".join( [b"%d" % x["length"] for x in lvl1peakset] )
        blockStarts = b",".join( [b"%d" % (x["start"]-start) for x in lvl1peakset] )

        if lvl2peak["start"] != thickStart:
            # add a 1bp mark for the start of the lvl2 peak
            thickStart = b"%d" % start
            blockNum += 1
            blockSizes = b"1,"+blockSizes
            blockStarts = b"0,"+blockStarts
        if lvl2peak["end"] != thickEnd:
            # add a 1bp mark for the end of the lvl2 peak
            thickEnd = b"%d" % end
            blockNum += 1
            blockSizes = blockSizes+b",1"
            blockStarts = blockStarts + b"," + (b"%d" % (end-start-1))

        # add to BroadPeakIO object
        bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=thickStart, thickEnd=thickEnd,
                   blockNum = blockNum, blockSizes = blockSizes, blockStarts = blockStarts, pileup = lvl2peak["pileup"],
                   pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"],
                   qscore = lvl2peak["qscore"] )
        return bpeaks
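
    # Example (illustrative): a lvl2 peak [1000, 2000) holding lvl1 peaks
    # [1100, 1300) and [1600, 1900) starts from blockSizes=b"200,300" and
    # blockStarts=b"100,600"; because lvl2peak["start"] (an int) never equals
    # thickStart (bytes), the two 1 bp edge marks are always added, giving
    # blockNum=4, blockSizes=b"1,200,300,1", blockStarts=b"0,100,600,999".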

cdef class TwoConditionScores:
    """Class for saving two-condition comparison scores.
    """
    cdef:
        dict data                      # dictionary for data of each chromosome
        dict datalength                # length of data array of each chromosome
        float32_t cond1_factor         # factor to apply to cond1 pileup values
        float32_t cond2_factor         # factor to apply to cond2 pileup values
        float32_t pseudocount          # the pseudocount used to calculate LLR
        float32_t cutoff
        object t1bdg, c1bdg, t2bdg, c2bdg
        dict pvalue_stat1, pvalue_stat2, pvalue_stat3

    def __init__ (self, t1bdg, c1bdg, t2bdg, c2bdg, float32_t cond1_factor = 1.0, float32_t cond2_factor = 1.0, float32_t pseudocount = 0.01, float32_t proportion_background_empirical_distribution = 0.99999 ):
        """
        t1bdg: a bedGraphTrackI object for treat 1
        c1bdg: a bedGraphTrackI object for control 1
        t2bdg: a bedGraphTrackI object for treat 2
        c2bdg: a bedGraphTrackI object for control 2

        cond1_factor: this will be multiplied to values in t1bdg and c1bdg
        cond2_factor: this will be multiplied to values in t2bdg and c2bdg

        pseudocount: pseudocount, by default 0.01.

        proportion_background_empirical_distribution: proportion of genome as the background to build empirical distribution
        """
        self.data = {}                  # for each chromosome, there is a l*4
                                        # matrix. First column: end position
                                        # of a region; second: LLR of treat 1
                                        # vs control 1; third: LLR of treat 2
                                        # vs control 2; fourth: LLR of treat 1
                                        # vs treat 2
        self.datalength = {}
        self.cond1_factor = cond1_factor
        self.cond2_factor = cond2_factor
        self.pseudocount = pseudocount
        self.pvalue_stat1 = {}
        self.pvalue_stat2 = {}
        self.t1bdg = t1bdg
        self.c1bdg = c1bdg
        self.t2bdg = t2bdg
        self.c2bdg = c2bdg

        #self.empirical_distr_llr = [] # save all values in histogram
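
    # Usage sketch (illustrative; t1/c1/t2/c2 stand for bedGraphTrackI-like
    # objects providing get_chr_names() and get_data_by_chr()):
    #
    #   tcs = TwoConditionScores(t1, c1, t2, c2, cond1_factor=1.0, cond2_factor=0.8)
    #   tcs.build()
    #   tcs.finalize()
    #   cat1, cat2, common = tcs.call_peaks(cutoff=3, min_length=200, max_gap=100)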

    cpdef set_pseudocount( self, float32_t pseudocount ):
        self.pseudocount = pseudocount

    cpdef build ( self ):
        """Compute scores from 3 types of comparisons and store them in self.data.
        """
        cdef:
            set common_chrs
            bytes chrname
            int32_t chrom_max_len
        # common chromosome names
        common_chrs = self.get_common_chrs()
        for chrname in common_chrs:
            (cond1_treat_ps, cond1_treat_vs) = self.t1bdg.get_data_by_chr(chrname)
            (cond1_control_ps, cond1_control_vs) = self.c1bdg.get_data_by_chr(chrname)
            (cond2_treat_ps, cond2_treat_vs) = self.t2bdg.get_data_by_chr(chrname)
            (cond2_control_ps, cond2_control_vs) = self.c2bdg.get_data_by_chr(chrname)
            chrom_max_len = len(cond1_treat_ps) + len(cond1_control_ps) +\
                            len(cond2_treat_ps) + len(cond2_control_ps)
            self.add_chromosome( chrname, chrom_max_len )
            self.build_chromosome( chrname,
                                   cond1_treat_ps, cond1_control_ps,
                                   cond2_treat_ps, cond2_control_ps,
                                   cond1_treat_vs, cond1_control_vs,
                                   cond2_treat_vs, cond2_control_vs )

    cdef build_chromosome( self, chrname,
                           cond1_treat_ps, cond1_control_ps,
                           cond2_treat_ps, cond2_control_ps,
                           cond1_treat_vs, cond1_control_vs,
                           cond2_treat_vs, cond2_control_vs ):
        """Internal function to calculate scores for three types of comparisons.

        cond1_treat_ps, cond1_control_ps: positions of treat and control of condition 1
        cond2_treat_ps, cond2_control_ps: positions of treat and control of condition 2
        cond1_treat_vs, cond1_control_vs: values of treat and control of condition 1
        cond2_treat_vs, cond2_control_vs: values of treat and control of condition 2
        """
        cdef:
            int32_t c1tp, c1cp, c2tp, c2cp, minp, pre_p
            float32_t c1tv, c1cv, c2tv, c2cv
        c1tpn = iter(cond1_treat_ps).__next__
        c1cpn = iter(cond1_control_ps).__next__
        c2tpn = iter(cond2_treat_ps).__next__
        c2cpn = iter(cond2_control_ps).__next__
        c1tvn = iter(cond1_treat_vs).__next__
        c1cvn = iter(cond1_control_vs).__next__
        c2tvn = iter(cond2_treat_vs).__next__
        c2cvn = iter(cond2_control_vs).__next__

        pre_p = 0

        try:
            c1tp = c1tpn()
            c1tv = c1tvn()

            c1cp = c1cpn()
            c1cv = c1cvn()

            c2tp = c2tpn()
            c2tv = c2tvn()

            c2cp = c2cpn()
            c2cv = c2cvn()

            while True:
                minp = min(c1tp, c1cp, c2tp, c2cp)
                # the current values hold up to the smallest pending breakpoint
                self.add( chrname, minp, c1tv, c1cv, c2tv, c2cv )
                pre_p = minp
                if c1tp == minp:
                    c1tp = c1tpn()
                    c1tv = c1tvn()
                if c1cp == minp:
                    c1cp = c1cpn()
                    c1cv = c1cvn()
                if c2tp == minp:
                    c2tp = c2tpn()
                    c2tv = c2tvn()
                if c2cp == minp:
                    c2cp = c2cpn()
                    c2cv = c2cvn()
        except StopIteration:
            # meet the end of either bedGraphTrackI, simply exit
            pass
        return
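
    # Worked sketch (illustrative): if the four position arrays are
    #   t1: [100, 250], c1: [250], t2: [150, 250], c2: [250]
    # the loop emits intervals ending at 100, 150 and 250, each scored with
    # the values in force over that interval; only the iterators whose
    # breakpoint equals the current minimum are advanced.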

    cdef set get_common_chrs ( self ):
        cdef:
            set t1chrs, c1chrs, t2chrs, c2chrs, common
        t1chrs = self.t1bdg.get_chr_names()
        c1chrs = self.c1bdg.get_chr_names()
        t2chrs = self.t2bdg.get_chr_names()
        c2chrs = self.c2bdg.get_chr_names()
        common = reduce(lambda x, y: x.intersection(y), (t1chrs, c1chrs, t2chrs, c2chrs))
        return common

    cdef add_chromosome ( self, bytes chrom, int32_t chrom_max_len ):
        """
        chrom: chromosome name
        chrom_max_len: maximum number of data points in this chromosome
        """
        if chrom not in self.data:
            self.data[chrom] = [ np.zeros( chrom_max_len, dtype="int32" ),   # pos
                                 np.zeros( chrom_max_len, dtype="float32" ), # LLR t1 vs c1
                                 np.zeros( chrom_max_len, dtype="float32" ), # LLR t2 vs c2
                                 np.zeros( chrom_max_len, dtype="float32" )] # LLR t1 vs t2
            self.datalength[chrom] = 0

    cdef add (self, bytes chromosome, int32_t endpos, float32_t t1, float32_t c1, float32_t t2, float32_t c2):
        """Take chr-endpos-sample1-control1-sample2-control2 and
        compute logLR for t1 vs c1, t2 vs c2, and t1 vs t2, then save
        the values.

        chromosome: chromosome name in string
        endpos    : end position of each interval in integer
        t1        : Sample 1 ChIP pileup value of each interval in float
        c1        : Sample 1 Control pileup value of each interval in float
        t2        : Sample 2 ChIP pileup value of each interval in float
        c2        : Sample 2 Control pileup value of each interval in float

        *Warning* Need to add regions continuously.
        """
        cdef:
            int32_t i
            list c
        i = self.datalength[chromosome]
        c = self.data[chromosome]
        c[0][ i ] = endpos
        c[1][ i ] = logLR_asym( (t1+self.pseudocount)*self.cond1_factor, (c1+self.pseudocount)*self.cond1_factor )
        c[2][ i ] = logLR_asym( (t2+self.pseudocount)*self.cond2_factor, (c2+self.pseudocount)*self.cond2_factor )
        c[3][ i ] = logLR_sym( (t1+self.pseudocount)*self.cond1_factor, (t2+self.pseudocount)*self.cond2_factor )
        self.datalength[chromosome] += 1
        return
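
    # Example (illustrative): with pseudocount=0.01 and both factors 1.0, a
    # call like add(b"chr1", 100, t1=10, c1=2, t2=3, c2=2) stores
    #   c[1][i] = logLR_asym(10.01, 2.01)   # cond1 treat vs cond1 control
    #   c[2][i] = logLR_asym(3.01, 2.01)    # cond2 treat vs cond2 control
    #   c[3][i] = logLR_sym(10.01, 3.01)    # cond1 vs cond2; sign marks direction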

    cpdef finalize ( self ):
        """
        Adjust array size of each chromosome.
        """
        cdef:
            bytes chrom
            int32_t l
            list d

        for chrom in sorted(self.data.keys()):
            d = self.data[chrom]
            l = self.datalength[chrom]
            d[0].resize( l, refcheck = False )
            d[1].resize( l, refcheck = False )
            d[2].resize( l, refcheck = False )
            d[3].resize( l, refcheck = False )
        return

    cpdef get_data_by_chr (self, bytes chromosome):
        """Return array of counts by chromosome.

        The return value is a tuple:
        ([end pos],[value])
        """
        if chromosome in self.data:
            return self.data[chromosome]
        else:
            return None

    cpdef get_chr_names (self):
        """Return all the chromosome names stored.
        """
        l = set(self.data.keys())
        return l

    cpdef write_bedGraph ( self, fhd, str name, str description, int32_t column = 3):
        """Write all data to fhd in bedGraph format.

        fhd: a filehandler to save bedGraph.

        name/description: the name and description in the track line.

        column: can be 1: cond1 chip vs cond1 ctrl, 2: cond2 chip vs cond2 ctrl, 3: cond1 chip vs cond2 chip
        """
        cdef:
            bytes chrom
            int32_t l, pre, i, p
            float32_t pre_v, v
            np.ndarray pos, value

        assert column in range( 1, 4 ), "column should be 1, 2 or 3."

        write = fhd.write

        #if self.trackline:
        #    # this line is REQUIRED by the wiggle format for UCSC browser
        #    write( "track type=bedGraph name=\"%s\" description=\"%s\"\n" % ( name.decode(), description ) )

        chrs = self.get_chr_names()
        for chrom in sorted(chrs):
            pos = self.data[ chrom ][ 0 ]
            value = self.data[ chrom ][ column ]
            l = self.datalength[ chrom ]
            pre = 0
            if pos.shape[ 0 ] == 0: continue # skip if there's no data
            pre_v = value[ 0 ]
            for i in range( 1, l ):
                v = value[ i ]
                p = pos[ i-1 ]
                if abs(pre_v - v) >= 1e-6:
                    write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) )
                    pre_v = v
                    pre = p
            p = pos[ -1 ]
            # last one
            write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) )

        return True

    cpdef write_matrix ( self, fhd, str name, str description ):
        """Write all data to fhd in a four-column format:

        col1: chr_start_end
        col2: t1 vs c1
        col3: t2 vs c2
        col4: t1 vs t2

        fhd: a filehandler to save the matrix.
        """
        cdef:
            bytes chrom
            int32_t l, pre, i, p
            float32_t v1, v2, v3
            np.ndarray pos, value1, value2, value3

        write = fhd.write

        chrs = self.get_chr_names()
        for chrom in sorted(chrs):
            [ pos, value1, value2, value3 ] = self.data[ chrom ]
            l = self.datalength[ chrom ]
            pre = 0
            if pos.shape[ 0 ] == 0: continue # skip if there's no data
            for i in range( 0, l ):
                v1 = value1[ i ]
                v2 = value2[ i ]
                v3 = value3[ i ]
                p = pos[ i ]
                write( "%s:%d_%d\t%.5f\t%.5f\t%.5f\n" % ( chrom.decode(), pre, p, v1, v2, v3 ) )
                pre = p

        return True
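
    # Example output line (illustrative): the key is chrom:start_end, followed
    # by the three LLR columns:
    #
    #   chr1:0_100    0.83885    0.15201    0.66751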

    cpdef tuple call_peaks (self, float32_t cutoff=3, int32_t min_length=200, int32_t max_gap = 100,
                            bool call_summits=False):
        """This function tries to find regions within which scores
        are continuously higher than a given cutoff.

        For bdgdiff.

        This function is NOT using sliding-windows. Instead, any
        region in the bedGraph above a certain cutoff will be detected,
        then merged with a nearby region if the gap between the two is
        below max_gap. After this, a peak is reported if its length is
        above min_length.

        cutoff: cutoff of value, default 3. For log10 LR, it means an
            LR of 1000 (or 1/1000 in the other direction).
        min_length :  minimum peak length, default 200.
        max_gap   :  maximum gap to merge nearby peaks, default 100.
        """
        cdef:
            int32_t i
            bytes chrom
            np.ndarray pos, t1_vs_c1, t2_vs_c2, t1_vs_t2, \
                       cond1_over_cond2, cond2_over_cond1, cond1_equal_cond2, \
                       cond1_sig, cond2_sig, \
                       cat1, cat2, cat3, \
                       cat1_startpos, cat1_endpos, cat2_startpos, cat2_endpos, \
                       cat3_startpos, cat3_endpos
        chrs = self.get_chr_names()
        cat1_peaks = PeakIO()           # to save peaks significant in condition 1
        cat2_peaks = PeakIO()           # to save peaks significant in condition 2
        cat3_peaks = PeakIO()           # to save peaks significant in both conditions

        self.cutoff = cutoff

        for chrom in sorted(chrs):
            pos = self.data[chrom][ 0 ]
            t1_vs_c1 = self.data[chrom][ 1 ]
            t2_vs_c2 = self.data[chrom][ 2 ]
            t1_vs_t2 = self.data[chrom][ 3 ]
            and_ = np.logical_and
            cond1_over_cond2 = t1_vs_t2 >= cutoff    # regions with stronger cond1 signals
            cond2_over_cond1 = t1_vs_t2 <= -1*cutoff # regions with stronger cond2 signals
            cond1_equal_cond2 = and_( t1_vs_t2 >= -1*cutoff, t1_vs_t2 <= cutoff )
            cond1_sig = t1_vs_c1 >= cutoff           # enriched regions in condition 1
            cond2_sig = t2_vs_c2 >= cutoff           # enriched regions in condition 2
            # indices where score is above cutoff
            cat1 = np.where( and_( cond1_sig, cond1_over_cond2 ) )[ 0 ] # cond1 stronger than cond2
            cat2 = np.where( and_( cond2_over_cond1, cond2_sig ) )[ 0 ] # cond2 stronger than cond1
            cat3 = np.where( and_( and_( cond1_sig, cond2_sig ),        # cond1 and cond2 both enriched, similar strength
                                   cond1_equal_cond2 ) ) [ 0 ]

            cat1_endpos = pos[cat1]     # end positions of regions where score is above cutoff
            cat1_startpos = pos[cat1-1] # start positions of regions where score is above cutoff
            cat2_endpos = pos[cat2]
            cat2_startpos = pos[cat2-1]
            cat3_endpos = pos[cat3]
            cat3_startpos = pos[cat3-1]

            # for cat1: condition 1 stronger regions
            self.__add_a_peak ( cat1_peaks, chrom, cat1, cat1_startpos, cat1_endpos, t1_vs_t2, max_gap, min_length )
            # for cat2: condition 2 stronger regions
            self.__add_a_peak ( cat2_peaks, chrom, cat2, cat2_startpos, cat2_endpos, -1 * t1_vs_t2, max_gap, min_length )
            # for cat3: commonly strong regions
            self.__add_a_peak ( cat3_peaks, chrom, cat3, cat3_startpos, cat3_endpos, abs(t1_vs_t2), max_gap, min_length )

        return (cat1_peaks, cat2_peaks, cat3_peaks)
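
    # Category logic in brief (illustrative): with cutoff=3, an interval with
    # t1_vs_c1=5, t2_vs_c2=4 and t1_vs_t2=0.5 is enriched in both conditions
    # with |t1_vs_t2| <= cutoff, so it falls into cat3 (commonly strong);
    # raising t1_vs_t2 to 3.5 would move it into cat1 (condition 1 stronger).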

    cdef object __add_a_peak ( self, object peaks, bytes chrom, np.ndarray indices, np.ndarray startpos, np.ndarray endpos,
                               np.ndarray score, int32_t max_gap, int32_t min_length ):
        """For a given chromosome, merge nearby significant regions,
        filter out smaller regions, then add the regions to the PeakIO
        object.
        """
        cdef:
            int32_t i
            list peak_content
            float32_t mean_logLR

        if startpos.size > 0:
            # if it is not empty
            peak_content = []
            if indices[0] == 0:
                # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
                startpos[0] = 0
            # first bit of region above cutoff
            peak_content.append( (startpos[0], endpos[0], score[indices[ 0 ]]) )
            for i in range( 1, startpos.size ):
                if startpos[i] - peak_content[-1][1] <= max_gap:
                    # append
                    peak_content.append( ( startpos[i], endpos[i], score[indices[ i ]] ) )
                else:
                    # close
                    if peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] >= min_length:
                        mean_logLR = self.mean_from_peakcontent( peak_content )
                        peaks.add( chrom, peak_content[0][0], peak_content[-1][1],
                                   summit = -1, peak_score = mean_logLR, pileup = 0, pscore = 0,
                                   fold_change = 0, qscore = 0,
                                   )
                    peak_content = [(startpos[i], endpos[i], score[ indices[ i ] ]),]

            # save the last peak
            if peak_content:
                if peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] >= min_length:
                    mean_logLR = self.mean_from_peakcontent( peak_content )
                    peaks.add( chrom, peak_content[0][0], peak_content[-1][1],
                               summit = -1, peak_score = mean_logLR, pileup = 0, pscore = 0,
                               fold_change = 0, qscore = 0,
                               )
        return

    cdef float32_t mean_from_peakcontent ( self, list peakcontent ):
        """Return the length-weighted mean score over the regions in
        peakcontent.
        """
        cdef:
            int32_t tmp_s, tmp_e
            int32_t l
            float64_t tmp_v, sum_v      # for better precision
            float32_t r
            int32_t i

        l = 0
        sum_v = 0                       # initialize sum_v as 0
        for i in range( len(peakcontent) ):
            tmp_s = peakcontent[i][0]
            tmp_e = peakcontent[i][1]
            tmp_v = peakcontent[i][2]
            sum_v += tmp_v * ( tmp_e - tmp_s )
            l += tmp_e - tmp_s

        r = <float32_t>( sum_v / l )
        return r
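
    # Worked example (illustrative): peakcontent [(0, 100, 2.0), (100, 300, 5.0)]
    # gives sum_v = 2.0*100 + 5.0*200 = 1200 over l = 300 bp, i.e. r = 4.0.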

    cdef int64_t total ( self ):
        """Return the number of regions in this object.
        """
        cdef:
            int64_t t
            bytes chrom

        t = 0
        for chrom in sorted(self.data.keys()):
            t += self.datalength[chrom]
        return t