1 # cython: language_level=3
3 # Time-stamp: <2024-10-14 19:32:34 Tao Liu>
5 """Module for BedGraph data class.
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file LICENSE included with
12 # ------------------------------------
14 # ------------------------------------
16 from array
import array
as pyarray
18 # ------------------------------------
20 # ------------------------------------
21 from MACS3
.Signal
.ScoreTrack
import ScoreTrackII
22 from MACS3
.IO
.PeakIO
import PeakIO
, BroadPeakIO
, PeakContent
23 from MACS3
.Signal
.Prob
import chisq_logp_e
25 # ------------------------------------
27 # ------------------------------------
29 from cython
.cimports
.cpython
import bool
31 import cython
.cimports
.numpy
as cnp
33 # ------------------------------------
35 # ------------------------------------
37 from cython
.cimports
.libc
.math
import sqrt
, log10
39 # ------------------------------------
41 # ------------------------------------
# log10(e): scale factor converting natural-log values to log10 (and back).
LOG10_E = 0.43429448190325176
44 # ------------------------------------
46 # ------------------------------------
58 # combine -log10pvalues
59 return chisq_logp_e(2*sum(x
)/LOG10_E
, 2*len(x
), log10
=True)
65 # subtraction of two items list
72 # division of two items list
79 # production of a list of values
80 # only python 3.8 or above
83 # ------------------------------------
85 # ------------------------------------
90 """Class for bedGraph type data.
92 In bedGraph, data are represented as continuous non-overlapping
93 regions in the whole genome. I keep this assumption in all the
94 functions. If data has overlaps, some functions will definitely
95 give incorrect results.
97 1. Continuous: the next region should be after the previous one
98 unless they are on different chromosomes;
100 2. Non-overlapping: the next region should never have overlaps
101 with preceding region.
103 The way to memorize bedGraph data is to remember the transition
104 points together with values of their preceding regions. The last
105 data point may exceed chromosome end, unless a chromosome
106 dictionary is given. Remember the coordinations in bedGraph and
107 this class is 0-indexed and right-open.
111 maxvalue
= cython
.declare(cython
.float, visibility
="public")
112 minvalue
= cython
.declare(cython
.float, visibility
="public")
113 baseline_value
= cython
.declare(cython
.float, visibility
="public")
115 def __init__(self
, baseline_value
: cython
.float = 0):
117 baseline_value is the value to fill in the regions not defined
118 in bedGraph. For example, if the bedGraph is like:
123 Then the region chr1:200..250 should be filled with baseline_value.
127 self
.maxvalue
= -10000000 # initial maximum value is tiny since I want safe_add_loc to update it
128 self
.minvalue
= 10000000 # initial minimum value is large since I want safe_add_loc to update it
129 self
.baseline_value
= baseline_value
132 def add_loc(self
, chromosome
: bytes
,
133 startpos
: cython
.int,
135 value
: cython
.float):
136 """Add a chr-start-end-value block into __data dictionary.
138 Note, we don't check if the add_loc is called continuously on
139 sorted regions without any gap. So we only suggest calling
140 this function within MACS.
145 # basic assumption, end pos should > start pos
152 if chromosome
not in self
.__data
:
153 self
.__data
[chromosome
] = [pyarray('i', []),
155 c
= self
.__data
[chromosome
]
157 # start pos is not 0, then add two blocks, the first
158 # with "baseline_value"; the second with "value"
159 c
[0].append(startpos
)
160 c
[1].append(self
.baseline_value
)
164 c
= self
.__data
[chromosome
]
165 # get the preceding region
168 # if this region is next to the previous one.
170 # if value is the same, simply extend it.
173 # otherwise, add a new region
177 if value
> self
.maxvalue
:
178 self
.maxvalue
= value
179 if value
< self
.minvalue
:
180 self
.minvalue
= value
183 def add_loc_wo_merge(self
, chromosome
: bytes
,
184 startpos
: cython
.int,
186 value
: cython
.float):
187 """Add a chr-start-end-value block into __data dictionary.
189 Note, we don't check if the add_loc is called continuously on
190 sorted regions without any gap. So we only suggest calling
191 this function within MACS.
193 This one won't merge nearby ranges with the same value
200 if value
< self
.baseline_value
:
201 value
= self
.baseline_value
203 if chromosome
not in self
.__data
:
204 self
.__data
[chromosome
] = [pyarray('i', []),
206 c
= self
.__data
[chromosome
]
208 # start pos is not 0, then add two blocks, the first
209 # with "baseline_value"; the second with "value"
210 c
[0].append(startpos
)
211 c
[1].append(self
.baseline_value
)
212 c
= self
.__data
[chromosome
]
215 if value
> self
.maxvalue
:
216 self
.maxvalue
= value
217 if value
< self
.minvalue
:
218 self
.minvalue
= value
221 def add_chrom_data(self
,
225 """Add a pv data to a chromosome. Replace the previous data.
227 p: a pyarray object 'i' for positions
228 v: a pyarray object 'f' for values
230 Note: no checks for error, use with caution
235 self
.__data
[chromosome
] = [p
, v
]
238 if maxv
> self
.maxvalue
:
240 if minv
< self
.minvalue
:
245 def add_chrom_data_PV(self
,
248 """Add a pv data to a chromosome. Replace the previous data.
250 This is a kinda silly function to waste time and convert a PV
251 array (2-d named numpy array) into two python arrays for this
252 BedGraph class. May have better function later.
254 Note: no checks for error, use with caution
259 self
.__data
[chromosome
] = [pyarray('i', pv
['p']),
260 pyarray('f', pv
['v'])]
263 if maxv
> self
.maxvalue
:
265 if minv
< self
.minvalue
:
270 def destroy(self
) -> bool:
271 """ destroy content, free memory.
276 chrs
= self
.get_chr_names()
277 for chrom
in sorted(chrs
):
278 if chrom
in self
.__data
:
279 self
.__data
[chrom
] = [None, None]
280 self
.__data
.pop(chrom
)
284 def get_data_by_chr(self
, chromosome
: bytes
) -> list:
285 """Return array of counts by chromosome.
287 The return value is a tuple:
290 if chromosome
in self
.__data
:
291 return self
.__data
[chromosome
]
296 def get_chr_names(self
) -> set:
297 """Return all the chromosome names stored.
300 return set(sorted(self
.__data
.keys()))
303 def reset_baseline(self
, baseline_value
: cython
.float):
304 """Reset baseline value to baseline_value.
306 So any region between self.baseline_value and baseline_value
307 will be set to baseline_value.
310 self
.baseline_value
= baseline_value
311 self
.filter_score(cutoff
=baseline_value
)
316 def merge_regions(self
):
317 """Merge nearby regions with the same value.
320 # new_pre_pos: cython.int
323 new_pre_value
: cython
.float
328 chrs
= self
.get_chr_names()
329 for chrom
in sorted(chrs
):
330 (p
, v
) = self
.__data
[chrom
]
331 pnext
= iter(p
).__next
__
332 vnext
= iter(v
).__next
__
335 new_pos
= pyarray('L', [pnext(),])
336 new_value
= pyarray('f', [vnext(),])
338 newpa
= new_pos
.append
339 newva
= new_value
.append
341 # new_pre_pos = new_pos[0]
342 new_pre_value
= new_value
[0]
344 for i
in range(1, len(p
)):
347 if value
== new_pre_value
:
354 new_pre_value
= value
355 self
.__data
[chrom
] = [new_pos
, new_value
]
359 def filter_score(self
, cutoff
: cython
.float = 0) -> bool:
360 """Filter using a score cutoff. Any region lower than score
361 cutoff will be set to self.baseline_value.
363 Self will be modified.
365 # new_pre_pos: cython.int
368 new_pre_value
: cython
.float
373 chrs
= self
.get_chr_names()
374 for chrom
in sorted(chrs
):
375 (p
, v
) = self
.__data
[chrom
]
376 pnext
= iter(p
).__next
__
377 vnext
= iter(v
).__next
__
380 new_pos
= pyarray('L', [])
381 new_value
= pyarray('f', [])
385 for i
in range(len(p
)):
390 # this region will be set to baseline_value
391 if new_pre_value
== self
.baseline_value
:
392 # if preceding region is at baseline, extend it
395 # else add a new baseline region
397 new_value
.append(self
.baseline_value
)
399 # put it into new arrays
401 new_value
.append(value
)
402 # new_pre_pos = new_pos[-1]
403 new_pre_value
= new_value
[-1]
404 self
.__data
[chrom
] = [new_pos
, new_value
]
408 def summary(self
) -> tuple:
409 """Calculate the sum, total_length, max, min, mean, and std.
411 Return a tuple for (sum, total_length, max, min, mean, std).
419 variance
: cython
.float
431 for (p
, v
) in self
.__data
.values():
432 # for each chromosome
434 for i
in range(len(p
)):
440 max_v
= max(max(v
), max_v
)
441 min_v
= min(min(v
), min_v
)
444 for (p
, v
) in self
.__data
.values():
445 for i
in range(len(p
)):
449 variance
+= tmp
*tmp
*ln
452 variance
/= float(n_v
-1)
453 std_v
= sqrt(variance
)
454 return (sum_v
, n_v
, max_v
, min_v
, mean_v
, std_v
)
458 cutoff
: cython
.float = 1,
459 min_length
: cython
.int = 200,
460 max_gap
: cython
.int = 50,
461 call_summits
: bool = False):
462 """This function try to find regions within which, scores
463 are continuously higher than a given cutoff.
465 This function is NOT using sliding-windows. Instead, any
466 regions in bedGraph above certain cutoff will be detected,
467 then merged if the gap between nearby two regions are below
468 max_gap. After this, peak is reported if its length is above
471 cutoff: cutoff of value, default 1.
472 min_length : minimum peak length, default 200.
473 gap : maximum gap to merge nearby peaks, default 50.
477 up_limit: the highest acceptable value. Default 10^{310}
478 * so only allow peak with value >=cutoff and <=up_limit
480 This does not work. The region above upper limit may still be
484 # peak_length: cython.int
493 chrs
= self
.get_chr_names()
494 peaks
= PeakIO() # dictionary to save peaks
495 for chrom
in sorted(chrs
):
498 (ps
, vs
) = self
.get_data_by_chr(chrom
) # arrays for position and values
499 psn
= iter(ps
).__next
__ # assign the next function to a viable to speed up
500 vsn
= iter(vs
).__next
__
502 pre_p
= 0 # remember previous position
504 # find the first region above cutoff
505 try: # try to read the first data range for this chrom
510 x
+= 1 # index for the next point
512 peak_content
= [(pre_p
, p
, v
),]
514 break # found the first range above cutoff
518 for i
in range(x
, len(ps
)):
519 # continue scan the rest regions
522 if v
< cutoff
: # not be detected as 'peak'
525 # for points above cutoff
526 # if the gap is allowed
527 if pre_p
- peak_content
[-1][1] <= max_gap
:
528 peak_content
.append((pre_p
, p
, v
))
530 # when the gap is not allowed, close this peak
531 self
.__close
_peak
(peak_content
,
534 chrom
) # , smoothlen=max_gap / 2)
536 peak_content
= [(pre_p
, p
, v
),]
542 self
.__close
_peak
(peak_content
,
545 chrom
) # , smoothlen=max_gap / 2)
549 def __close_peak(self
,
552 min_length
: cython
.int,
553 chrom
: bytes
) -> bool:
554 tsummit
: list # list for temporary summits
555 peak_length
: cython
.int
559 summit_value
: cython
.float
561 peak_length
= peak_content
[-1][1]-peak_content
[0][0]
562 if peak_length
>= min_length
: # if the peak is too small, reject it
566 for (tstart
, tend
, tvalue
) in peak_content
:
567 if not summit_value
or summit_value
< tvalue
:
568 tsummit
= [cython
.cast(cython
.int, (tend
+tstart
)/2),]
569 summit_value
= tvalue
570 elif summit_value
== tvalue
:
571 tsummit
.append(cython
.cast(cython
.int, (tend
+tstart
)/2))
572 summit
= tsummit
[cython
.cast(cython
.int, (len(tsummit
)+1)/2)-1]
577 peak_score
=summit_value
,
586 def call_broadpeaks(self
,
587 lvl1_cutoff
: cython
.float = 500,
588 lvl2_cutoff
: cython
.float = 100,
589 min_length
: cython
.int = 200,
590 lvl1_max_gap
: cython
.int = 50,
591 lvl2_max_gap
: cython
.int = 400):
592 """This function try to find enriched regions within which,
593 scores are continuously higher than a given cutoff for level
594 1, and link them using the gap above level 2 cutoff with a
595 maximum length of lvl2_max_gap.
597 lvl1_cutoff: cutoff of value at enriched regions, default 500.
598 lvl2_cutoff: cutoff of value at linkage regions, default 100.
599 min_length : minimum peak length, default 200.
600 lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50.
601 lvl2_max_gap : maximum length of linkage regions, default 400.
602 colname: can be 'sample','control','-100logp','-100logq'. Cutoff will be applied to the specified column.
604 Return both general PeakIO object for highly enriched regions
605 and gapped broad regions in BroadPeakIO.
612 lvl2
: PeakContent
# PeakContent class object
616 assert lvl1_cutoff
> lvl2_cutoff
, "level 1 cutoff should be larger than level 2."
617 assert lvl1_max_gap
< lvl2_max_gap
, "level 2 maximum gap should be larger than level 1."
618 lvl1_peaks
= self
.call_peaks(cutoff
=lvl1_cutoff
,
619 min_length
=min_length
,
620 max_gap
=lvl1_max_gap
,
622 lvl2_peaks
= self
.call_peaks(cutoff
=lvl2_cutoff
,
623 min_length
=min_length
,
624 max_gap
=lvl2_max_gap
,
626 chrs
= lvl1_peaks
.get_chr_names()
627 broadpeaks
= BroadPeakIO()
628 # use lvl2_peaks as linking regions between lvl1_peaks
629 for chrom
in sorted(chrs
):
630 lvl1peakschrom
= lvl1_peaks
.get_data_from_chrom(chrom
)
631 lvl2peakschrom
= lvl2_peaks
.get_data_from_chrom(chrom
)
632 lvl1peakschrom_next
= iter(lvl1peakschrom
).__next
__
633 tmppeakset
= [] # to temporarily store lvl1 region inside a lvl2 region
634 # our assumption is lvl1 regions should be included in lvl2 regions
636 lvl1
= lvl1peakschrom_next()
637 for i
in range(len(lvl2peakschrom
)):
638 # for each lvl2 peak, find all lvl1 peaks inside
639 lvl2
= lvl2peakschrom
[i
]
641 if lvl2
["start"] <= lvl1
["start"] and lvl1
["end"] <= lvl2
["end"]:
642 tmppeakset
.append(lvl1
)
643 lvl1
= lvl1peakschrom_next()
645 self
.__add
_broadpeak
(broadpeaks
,
651 except StopIteration:
652 self
.__add
_broadpeak
(broadpeaks
, chrom
, lvl2
, tmppeakset
)
654 for j
in range(i
+1, len(lvl2peakschrom
)):
655 self
.__add
_broadpeak
(broadpeaks
,
662 def __add_broadpeak(self
,
665 lvl2peak
: PeakContent
,
667 """Internal function to create broad peak.
677 start
= lvl2peak
["start"]
678 end
= lvl2peak
["end"]
680 # the following code will add those broad/lvl2 peaks with no
681 # strong/lvl1 peaks inside
684 # will complement by adding 1bps start and end to this region
685 # may change in the future if gappedPeak format was improved.
686 bpeaks
.add(chrom
, start
, end
,
687 score
=lvl2peak
["score"],
688 thickStart
=(b
"%d" % start
),
689 thickEnd
=(b
"%d" % end
),
692 blockStarts
=(b
"0,%d" % (end
-start
-1)),
693 pileup
=lvl2peak
["pileup"],
694 pscore
=lvl2peak
["pscore"],
695 fold_change
=lvl2peak
["fc"],
696 qscore
=lvl2peak
["qscore"])
699 thickStart
= b
"%d" % lvl1peakset
[0]["start"]
700 thickEnd
= b
"%d" % lvl1peakset
[-1]["end"]
701 blockNum
= len(lvl1peakset
)
702 blockSizes
= b
",".join([b
"%d" % x
["length"] for x
in lvl1peakset
])
703 blockStarts
= b
",".join([b
"%d" % (x
["start"]-start
) for x
in lvl1peakset
])
705 if int(thickStart
) != start
:
707 thickStart
= b
"%d" % start
709 blockSizes
= b
"1,"+blockSizes
710 blockStarts
= b
"0,"+blockStarts
711 if int(thickEnd
) != end
:
712 # add 1bp right block
713 thickEnd
= b
"%d" % end
715 blockSizes
= blockSizes
+b
",1"
716 blockStarts
= blockStarts
+ b
"," + (b
"%d" % (end
-start
-1))
718 bpeaks
.add(chrom
, start
, end
,
719 score
=lvl2peak
["score"],
720 thickStart
=thickStart
,
723 blockSizes
=blockSizes
,
724 blockStarts
=blockStarts
,
725 pileup
=lvl2peak
["pileup"],
726 pscore
=lvl2peak
["pscore"],
727 fold_change
=lvl2peak
["fc"],
728 qscore
=lvl2peak
["qscore"])
732 def refine_peaks(self
, peaks
):
733 """This function try to based on given peaks, re-evaluate the
734 peak region, call the summit.
737 return: a new PeakIO object
750 chrs
= self
.get_chr_names()
751 assert isinstance(peaks
, PeakIO
)
752 chrs
= chrs
.intersection(set(peaks
.get_chr_names()))
754 for chrom
in sorted(chrs
):
755 peaks_chr
= peaks
.get_data_from_chrom(chrom
)
757 # arrays for position and values
758 (ps
, vs
) = self
.get_data_by_chr(chrom
)
759 # assign the next function to a viable to speed up
760 psn
= iter(ps
).__next
__
761 vsn
= iter(vs
).__next
__
762 peakn
= iter(peaks_chr
).__next
__
764 # remember previous position in bedgraph/self
769 peak_s
= peak
["start"]
773 if p
> peak_s
and peak_e
> pre_p
:
774 # now put four coordinates together and pick the middle two
775 s
, e
= sorted([p
, peak_s
, peak_e
, pre_p
])[1:3]
777 peak_content
.append((s
, e
, v
))
784 # no more value chunk in bedGraph
786 elif pre_p
>= peak_e
:
788 self
.__close
_peak
(peak_content
, new_peaks
, 0, chrom
)
793 peak_s
= peak
["start"]
805 # no more value chunk in bedGraph
808 raise Exception(f
"no way here! prev position:{pre_p}; position:{p}; value:{v}; peak start:{peak_s}; peak end:{peak_e}")
812 self
.__close
_peak
(peak_content
, new_peaks
, 0, chrom
)
816 def total(self
) -> cython
.int:
817 """Return the number of regions in this object.
822 for (p
, v
) in self
.__data
.values():
827 def set_single_value(self
, new_value
: cython
.float):
828 """Change all the values in bedGraph to the same new_value,
829 return a new bedGraphTrackI.
835 ret
= bedGraphTrackI()
836 chroms
= set(self
.get_chr_names())
837 for chrom
in sorted(chroms
):
838 # arrays for position and values
839 (p1
, v1
) = self
.get_data_by_chr(chrom
)
842 # add a region from 0 to max_p
843 ret
.add_loc(chrom
, 0, max_p
, new_value
)
847 def overlie(self
, bdgTracks
, func
: str = "max"):
848 """Calculate two or more bedGraphTrackI objects by letting self
849 overlying bdgTrack2, with user-defined functions.
851 Transition positions from both bedGraphTrackI objects will be
852 considered and combined. For example:
854 #1 bedGraph (self) | #2 bedGraph
855 -----------------------------------------------
856 chr1 0 100 0 | chr1 0 150 1
857 chr1 100 200 3 | chr1 150 250 2
858 chr1 200 300 4 | chr1 250 300 4
860 these two bedGraphs will be combined to have five transition
861 points: 100, 150, 200, 250, and 300. So in order to calculate
862 two bedGraphs, I pair values within the following regions
865 chr s e (#1,#2) applied_func_max
866 -----------------------------------------------
873 Then the given 'func' will be applied on each 2-tuple as func(#1,#2)
875 Supported 'func' are "sum", "subtract" (only for two bdg
876 objects), "product", "divide" (only for two bdg objects),
877 "max", "mean" and "fisher".
879 Return value is a new bedGraphTrackI object.
881 Option: bdgTracks can be a list of bedGraphTrackI objects
886 nr_tracks
= len(bdgTracks
) + 1 # +1 for self
887 assert nr_tracks
>= 2, "Specify at least one more bdg objects."
888 for i
, bdgTrack
in enumerate(bdgTracks
):
889 assert isinstance(bdgTrack
, bedGraphTrackI
), "bdgTrack{} is not a bedGraphTrackI object".format(i
+ 1)
895 elif func
== "fisher":
899 elif func
== "product":
901 elif func
== "subtract":
905 raise Exception(f
"Only one more bdg object is allowed, but provided {nr_tracks-1}")
906 elif func
== "divide":
910 raise Exception(f
"Only one more bdg object is allowed, but provided {nr_tracks-1}")
912 raise Exception("Invalid function {func}! Choose from 'sum', 'subtract' (only for two bdg objects), 'product', 'divide' (only for two bdg objects), 'max', 'mean' and 'fisher'. ")
914 ret
= bedGraphTrackI()
916 common_chr
= set(self
.get_chr_names())
917 for track
in bdgTracks
:
918 common_chr
= common_chr
.intersection(set(track
.get_chr_names()))
920 for chrom
in sorted(common_chr
):
921 datas
= [self
.get_data_by_chr(chrom
)]
922 datas
.extend([bdgTracks
[i
].get_data_by_chr(chrom
) for i
in range(len(bdgTracks
))])
924 ps
, vs
, pn
, vn
= [], [], [], []
927 pn
.append(iter(ps
[-1]).__next
__)
929 vn
.append(iter(vs
[-1]).__next
__)
931 pre_p
= 0 # remember the previous position in the new bedGraphTrackI object ret
933 ps_cur
= [pn
[i
]() for i
in range(len(pn
))]
934 vs_cur
= [vn
[i
]() for i
in range(len(pn
))]
937 # get the lowest position
938 lowest_p
= min(ps_cur
)
940 # at least one lowest position, could be multiple
941 locations
= [i
for i
in range(len(ps_cur
)) if ps_cur
[i
] == lowest_p
]
943 # add the data until the interval
944 ret
.add_loc(chrom
, pre_p
, ps_cur
[locations
[0]], f(vs_cur
))
946 pre_p
= ps_cur
[locations
[0]]
947 for index
in locations
:
948 ps_cur
[index
] = pn
[index
]()
949 vs_cur
[index
] = vn
[index
]()
950 except StopIteration:
951 # meet the end of either bedGraphTrackI, simply exit
956 def apply_func(self
, func
) -> bool:
957 """Apply function 'func' to every value in this bedGraphTrackI object.
959 *Two adjacent regions with same value after applying func will
964 for (p
, s
) in self
.__data
.values():
965 for i
in range(len(s
)):
967 self
.maxvalue
= func(self
.maxvalue
)
968 self
.minvalue
= func(self
.minvalue
)
973 """Convert pvalue scores to qvalue scores.
975 *Assume scores in this bedGraph are pvalue scores! Not work
976 for other type of scores.
980 pscore_array
: pyarray
981 pvalue_stat
: dict = {}
988 nhcal
: cython
.long = 0
993 # pre_v: cython.float
1000 # calculate frequencies of each p-score
1001 for chrom
in sorted(self
.get_chr_names()):
1004 [pos_array
, pscore_array
] = self
.__data
[chrom
]
1006 pn
= iter(pos_array
).__next
__
1007 vn
= iter(pscore_array
).__next
__
1009 for i
in range(len(pos_array
)):
1012 this_l
= this_p
- pre_p
1013 if this_v
in pvalue_stat
:
1014 pvalue_stat
[this_v
] += this_l
1016 pvalue_stat
[this_v
] = this_l
1019 # nhcal += len(pos_array)
1023 N
= sum(pvalue_stat
.values()) # total length
1026 # pre_v = -2147483647
1028 pre_q
= 2147483647 # save the previous q-value
1030 # calculate qscore for each pscore
1032 unique_values
= sorted(pvalue_stat
.keys(), reverse
=True)
1033 for i
in range(len(unique_values
)):
1034 v
= unique_values
[i
]
1035 # l = pvalue_stat[v]
1036 q
= v
+ (log10(k
) + f
)
1037 q
= max(0, min(pre_q
, q
)) # make q-score monotonic
1044 # convert pscore to qscore
1045 for chrom
in sorted(self
.get_chr_names()):
1046 [pos_array
, pscore_array
] = self
.__data
[chrom
]
1048 for i
in range(len(pos_array
)):
1049 pscore_array
[i
] = pqtable
[pscore_array
[i
]]
1051 self
.merge_regions()
1055 def extract_value(self
, bdgTrack2
):
1056 """Extract values from regions defined in bedGraphTrackI class object
1068 assert isinstance(bdgTrack2
, bedGraphTrackI
), "not a bedGraphTrackI object"
1070 # 1: region in bdgTrack2; 2: value; 3: length with the value
1071 ret
= [[], pyarray('f', []), pyarray('L', [])]
1072 radd
= ret
[0].append
1073 vadd
= ret
[1].append
1074 ladd
= ret
[2].append
1076 chr1
= set(self
.get_chr_names())
1077 chr2
= set(bdgTrack2
.get_chr_names())
1078 common_chr
= chr1
.intersection(chr2
)
1079 for i
in range(len(common_chr
)):
1080 chrom
= common_chr
.pop()
1081 (p1s
, v1s
) = self
.get_data_by_chr(chrom
) # arrays for position and values
1082 # assign the next function to a viable to speed up
1083 p1n
= iter(p1s
).__next
__
1084 v1n
= iter(v1s
).__next
__
1086 # arrays for position and values
1087 (p2s
, v2s
) = bdgTrack2
.get_data_by_chr(chrom
)
1088 # assign the next function to a viable to speed up
1089 p2n
= iter(p2s
).__next
__
1090 v2n
= iter(v2s
).__next
__
1091 # remember the previous position in the new bedGraphTrackI
1103 # clip a region from pre_p to p1, then set pre_p as p1.
1105 radd(str(chrom
)+"."+str(pre_p
)+"."+str(p1
))
1109 # call for the next p1 and v1
1113 # clip a region from pre_p to p2, then set
1116 radd(str(chrom
)+"."+str(pre_p
)+"."+str(p2
))
1120 # call for the next p2 and v2
1124 # from pre_p to p1 or p2, then set pre_p as p1 or p2.
1126 radd(str(chrom
)+"."+str(pre_p
)+"."+str(p1
))
1130 # call for the next p1, v1, p2, v2.
1135 except StopIteration:
1136 # meet the end of either bedGraphTrackI, simply exit
1142 def extract_value_hmmr(self
, bdgTrack2
):
1143 """Extract values from regions defined in bedGraphTrackI class object
1146 I will try to tweak this function to output only the values of
1147 bdgTrack1 (self) in the regions in bdgTrack2
1149 This is specifically for HMMRATAC. bdgTrack2 should be a
1150 bedgraph object containing the bins with value set to
1151 'mark_bin' -- the bins in the same region will have the same
1163 assert isinstance(bdgTrack2
, bedGraphTrackI
), "not a bedGraphTrackI object"
1165 # 0: bin location (chrom, position); 1: value; 2: number of bins in this region
1166 ret
= [[], pyarray('f', []), pyarray('i', [])]
1167 padd
= ret
[0].append
1168 vadd
= ret
[1].append
1169 ladd
= ret
[2].append
1171 chr1
= set(self
.get_chr_names())
1172 chr2
= set(bdgTrack2
.get_chr_names())
1173 common_chr
= sorted(list(chr1
.intersection(chr2
)))
1174 for i
in range(len(common_chr
)):
1175 chrom
= common_chr
.pop()
1176 # arrays for position and values
1177 (p1s
, v1s
) = self
.get_data_by_chr(chrom
)
1178 # assign the next function to a viable to speed up
1179 p1n
= iter(p1s
).__next
__
1180 v1n
= iter(v1s
).__next
__
1182 # arrays for position and values
1183 (p2s
, v2s
) = bdgTrack2
.get_data_by_chr(chrom
)
1184 # assign the next function to a viable to speed up
1185 p2n
= iter(p2s
).__next
__
1186 v2n
= iter(v2s
).__next
__
1187 # remember the previous position in the new bedGraphTrackI
1199 # clip a region from pre_p to p1, then set pre_p as p1.
1200 # in this case, we don't output any
1202 # radd(str(chrom)+"."+str(pre_p)+"."+str(p1))
1206 # call for the next p1 and v1
1210 # clip a region from pre_p to p2, then set pre_p as p2.
1211 if v2
!= 0: # 0 means it's a gap region, we should have value > 1
1216 # call for the next p2 and v2
1220 # from pre_p to p1 or p2, then set pre_p as p1 or p2.
1221 if v2
!= 0: # 0 means it's a gap region, we should have 1 or -1
1226 # call for the next p1, v1, p2, v2.
1231 except StopIteration:
1232 # meet the end of either bedGraphTrackI, simply exit
1238 def make_ScoreTrackII_for_macs(self
, bdgTrack2
,
1239 depth1
: float = 1.0,
1240 depth2
: float = 1.0):
1241 """A modified overlie function for MACS v2.
1243 effective_depth_in_million: sequencing depth in million after
1244 duplicates being filtered. If
1245 treatment is scaled down to
1246 control sample size, then this
1247 should be control sample size in
1248 million. And vice versa.
1250 Return value is a ScoreTrackII object.
1259 assert isinstance(bdgTrack2
, bedGraphTrackI
), "bdgTrack2 is not a bedGraphTrackI object"
1261 ret
= ScoreTrackII(treat_depth
=depth1
,
1265 chr1
= set(self
.get_chr_names())
1266 chr2
= set(bdgTrack2
.get_chr_names())
1267 common_chr
= chr1
.intersection(chr2
)
1268 for chrom
in sorted(common_chr
):
1269 # arrays for position and values
1270 (p1s
, v1s
) = self
.get_data_by_chr(chrom
)
1271 # assign the next function to a viable to speed up
1272 p1n
= iter(p1s
).__next
__
1273 v1n
= iter(v1s
).__next
__
1274 # arrays for position and values
1275 (p2s
, v2s
) = bdgTrack2
.get_data_by_chr(chrom
)
1276 # assign the next function to a viable to speed up
1277 p2n
= iter(p2s
).__next
__
1278 v2n
= iter(v2s
).__next
__
1280 # this is the maximum number of locations needed to be
1281 # recorded in scoreTrackI for this chromosome.
1282 chrom_max_len
= len(p1s
)+len(p2s
)
1284 ret
.add_chromosome(chrom
, chrom_max_len
)
1286 # remember the previous position in the new bedGraphTrackI
1299 # clip a region from pre_p to p1, then set pre_p as p1.
1300 retadd(chrom
, p1
, v1
, v2
)
1302 # call for the next p1 and v1
1306 # clip a region from pre_p to p2, then set pre_p as p2.
1307 retadd(chrom
, p2
, v1
, v2
)
1309 # call for the next p2 and v2
1313 # from pre_p to p1 or p2, then set pre_p as p1 or p2.
1314 retadd(chrom
, p1
, v1
, v2
)
1316 # call for the next p1, v1, p2, v2.
1321 except StopIteration:
1322 # meet the end of either bedGraphTrackI, simply exit
1326 # ret.merge_regions()
1330 def cutoff_analysis(self
,
1331 max_gap
: cython
.int,
1332 min_length
: cython
.int,
1333 steps
: cython
.int = 100,
1334 min_score
: cython
.float = 0,
1335 max_score
: cython
.float = 1000) -> str:
1337 Cutoff analysis function for bedGraphTrackI object.
1339 This function will try all possible cutoff values on the score
1340 column to call peaks. Then will give a report of a number of
1341 metrics (number of peaks, total length of peaks, average
1342 length of peak) at varying score cutoffs. For each score
1343 cutoff, the function finds the positions where the score
1344 exceeds the cutoff, then groups those positions into "peaks"
1345 based on the maximum allowed gap (max_gap) between consecutive
1346 positions. If a peak's length exceeds the minimum length
1347 (min_length), the peak is counted.
1353 Maximum allowed gap between consecutive positions above cutoff
1355 min_length : int32_t Minimum length of peak
1357 It will be used to calculate 'step' to increase from min_v to
1360 min_score: float32_t
1361 Minimum score for cutoff analysis. Note1: we will take the
1362 larger value between the actual minimum value in the BedGraph
1363 and min_score as min_v. Note2: the min_v won't be included in
1364 the final result. We will try to output the smallest cutoff as
1367 max_score: float32_t
1368 Maximum score for cutoff analysis. Note1: we will take the
1369 smaller value between the actual maximum value in the BedGraph
1370 and max_score as max_v. Note2: the max_v may not be included
1371 in the final result. We will only output the cutoff that can
1372 generate at least 1 peak.
1377 Cutoff analysis report in str object.
1382 May need to separate this function out as a class so that we
1383 can add more ways to analyze the result. Also, we can let this
1384 function return a list of dictionary or data.frame in that
1385 way, instead of str object.
1395 cutoff
: cython
.float
1396 total_l
: cython
.long
1397 total_p
: cython
.long
1404 peak_length
: cython
.long
1405 # dict cutoff_npeaks, cutoff_lpeaks
1408 chrs
= self
.get_chr_names()
1410 # midvalue = self.minvalue/2 + self.maxvalue/2
1411 # s = float(self.minvalue - midvalue)/steps
1412 minv
= max(min_score
, self
.minvalue
)
1413 maxv
= min(self
.maxvalue
, max_score
)
1415 s
= float(maxv
- minv
)/steps
1417 # a list of possible cutoff values from minv to maxv with step of s
1418 cutoff_list
= [round(value
, 3) for value
in np
.arange(minv
, maxv
, s
)]
1420 cutoff_npeaks
= [0] * len(cutoff_list
)
1421 cutoff_lpeaks
= [0] * len(cutoff_list
)
1423 for chrom
in sorted(chrs
):
1424 (pos_array
, score_array
) = self
.__data
[chrom
]
1425 pos_array
= np
.array(self
.__data
[chrom
][0])
1426 score_array
= np
.array(self
.__data
[chrom
][1])
1428 for n
in range(len(cutoff_list
)):
1429 cutoff
= cutoff_list
[n
]
1430 total_l
= 0 # total length of peaks
1431 total_p
= 0 # total number of peaks
1433 # get the regions with scores above cutoffs. This is
1434 # not an optimized method. It would be better to store
1435 # score array in a 2-D ndarray?
1436 above_cutoff
= np
.nonzero(score_array
> cutoff
)[0]
1437 # end positions of regions where score is above cutoff
1438 above_cutoff_endpos
= pos_array
[above_cutoff
]
1439 # start positions of regions where score is above cutoff
1440 above_cutoff_startpos
= pos_array
[above_cutoff
-1]
1442 if above_cutoff_endpos
.size
== 0:
1445 # first bit of region above cutoff
1446 acs_next
= iter(above_cutoff_startpos
).__next
__
1447 ace_next
= iter(above_cutoff_endpos
).__next
__
1451 peak_content
= [(ts
, te
),]
1454 for i
in range(1, above_cutoff_startpos
.size
):
1459 peak_content
.append((ts
, te
))
1461 peak_length
= peak_content
[-1][1] - peak_content
[0][0]
1462 # if the peak is too small, reject it
1463 if peak_length
>= min_length
:
1464 total_l
+= peak_length
1466 peak_content
= [(ts
, te
),]
1470 peak_length
= peak_content
[-1][1] - peak_content
[0][0]
1471 # if the peak is too small, reject it
1472 if peak_length
>= min_length
:
1473 total_l
+= peak_length
1475 cutoff_lpeaks
[n
] += total_l
1476 cutoff_npeaks
[n
] += total_p
1478 # prepare the returnning text
1479 ret_list
= ["score\tnpeaks\tlpeaks\tavelpeak\n"]
1480 for n
in range(len(cutoff_list
)-1, -1, -1):
1481 cutoff
= cutoff_list
[n
]
1482 if cutoff_npeaks
[n
] > 0:
1483 ret_list
.append("%.2f\t%d\t%d\t%.2f\n" % (cutoff
,
1486 cutoff_lpeaks
[n
]/cutoff_npeaks
[n
]))
1487 ret
= ''.join(ret_list
)
1492 def calculate_elbows(values
: cnp
.ndarray
,
1493 threshold
: cython
.float = 0.01) -> cnp
.ndarray
:
1494 # although this function is supposed to find elbow pts for cutoff
1495 # analysis, however, in reality, it barely works...
1498 delta_slopes
: cnp
.ndarray
1500 avg_delta_slope
: cython
.float
1502 # Calculate the difference between each point and the first point
1503 deltas
= values
- values
[0]
1504 # Calculate the slope between each point and the last point
1505 slopes
= deltas
/ (values
[-1] - values
[0])
1506 # Calculate the change in slope
1507 delta_slopes
= np
.diff(slopes
)
1508 # Calculate the average change in slope
1509 avg_delta_slope
= np
.mean(delta_slopes
)
1510 # Find all points where the change in slope is significantly
1511 # larger than the average
1512 elbows
= np
.where(delta_slopes
> avg_delta_slope
+ threshold
)[0]