# MACS3/Signal/BedGraph.py
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2024-10-14 19:32:34 Tao Liu>

"""Module for BedGraph data class.

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""

# ------------------------------------
# python modules
# ------------------------------------
import cython
from array import array as pyarray
from math import prod
# ------------------------------------
# MACS3 modules
# ------------------------------------
from MACS3.Signal.ScoreTrack import ScoreTrackII
from MACS3.IO.PeakIO import PeakIO, BroadPeakIO, PeakContent
from MACS3.Signal.Prob import chisq_logp_e

# ------------------------------------
# Other modules
# ------------------------------------

from cython.cimports.cpython import bool
import numpy as np
import cython.cimports.numpy as cnp

# ------------------------------------
# C lib
# ------------------------------------

from cython.cimports.libc.math import sqrt, log10

# ------------------------------------
# constants
# ------------------------------------
LOG10_E = 0.43429448190325176

# ------------------------------------
# Misc functions
# ------------------------------------

@cython.inline
@cython.cfunc
def mean_func(x):
    return sum(x)/len(x)


@cython.inline
@cython.cfunc
def fisher_func(x):
    # combine -log10pvalues
    return chisq_logp_e(2*sum(x)/LOG10_E, 2*len(x), log10=True)


@cython.inline
@cython.cfunc
def subtract_func(x):
    # subtraction of two items list
    return x[1] - x[0]


@cython.inline
@cython.cfunc
def divide_func(x):
    # division of two items list
    # note: indexing x[2] would be out of range for a two-item list, so
    # use the same element ordering as subtract_func
    return x[1] / x[0]


@cython.inline
@cython.cfunc
def product_func(x):
    # product of a list of values
    # only python 3.8 or above
    return prod(x)

# ------------------------------------
# Classes
# ------------------------------------


@cython.cclass
class bedGraphTrackI:
    """Class for bedGraph type data.

    In bedGraph, data are represented as continuous non-overlapping
    regions in the whole genome. I keep this assumption in all the
    functions. If data has overlaps, some functions will definitely
    give incorrect results.

    1. Continuous: the next region should be after the previous one
    unless they are on different chromosomes;

    2. Non-overlapping: the next region should never have overlaps
    with the preceding region.

    The way to memorize bedGraph data is to remember the transition
    points together with the values of their preceding regions. The
    last data point may exceed the chromosome end, unless a chromosome
    dictionary is given. Remember that the coordinates in bedGraph and
    in this class are 0-indexed and right-open.
    """
    __data: dict
    maxvalue = cython.declare(cython.float, visibility="public")
    minvalue = cython.declare(cython.float, visibility="public")
    baseline_value = cython.declare(cython.float, visibility="public")
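
    # Editor's note: illustrative sketch, not part of the original code.
    # Under the storage scheme described above, a bedGraph like
    #     chr1  0    100  1.0
    #     chr1  100  200  2.0
    # is kept per chromosome as two parallel arrays holding the region end
    # positions and the values of the regions ending there:
    #     __data[b"chr1"] = [pyarray('i', [100, 200]),
    #                        pyarray('f', [1.0, 2.0])]
    # i.e. the start of each region is implicitly the end of the previous
    # one (or 0 for the first region), with 0-based, right-open coordinates.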

    def __init__(self, baseline_value: cython.float = 0):
        """
        baseline_value is the value to fill in the regions not defined
        in bedGraph. For example, if the bedGraph is like:

        chr1  100  200  1
        chr1  250  350  2

        Then the region chr1:200..250 should be filled with baseline_value.

        """
        self.__data = {}
        self.maxvalue = -10000000  # initial maximum value is tiny since I want safe_add_loc to update it
        self.minvalue = 10000000   # initial minimum value is large since I want safe_add_loc to update it
        self.baseline_value = baseline_value

    @cython.ccall
    def add_loc(self, chromosome: bytes,
                startpos: cython.int,
                endpos: cython.int,
                value: cython.float):
        """Add a chr-start-end-value block into __data dictionary.

        Note, we don't check if add_loc is called continuously on
        sorted regions without any gap. So we only suggest calling
        this function within MACS.

        """
        pre_v: cython.float

        # basic assumption, end pos should > start pos
        if endpos <= 0:
            return
        if startpos < 0:
            startpos = 0

        if chromosome not in self.__data:
            self.__data[chromosome] = [pyarray('i', []),
                                       pyarray('f', [])]
            c = self.__data[chromosome]
            if startpos:
                # start pos is not 0, then add two blocks, the first
                # with "baseline_value"; the second with "value"
                c[0].append(startpos)
                c[1].append(self.baseline_value)
            c[0].append(endpos)
            c[1].append(value)
        else:
            c = self.__data[chromosome]
            # get the preceding region
            pre_v = c[1][-1]

            # if this region is next to the previous one.
            if pre_v == value:
                # if value is the same, simply extend it.
                c[0][-1] = endpos
            else:
                # otherwise, add a new region
                c[0].append(endpos)
                c[1].append(value)

        if value > self.maxvalue:
            self.maxvalue = value
        if value < self.minvalue:
            self.minvalue = value
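
    # Editor's note: a minimal usage sketch with made-up values, showing how
    # add_loc() pads with baseline_value and merges equal-valued neighbours:
    #
    #     bdg = bedGraphTrackI(baseline_value=0)
    #     bdg.add_loc(b"chr1", 100, 200, 1.5)  # chr1:0-100 gets baseline 0, chr1:100-200 gets 1.5
    #     bdg.add_loc(b"chr1", 200, 300, 1.5)  # same value: previous block extended to 300
    #     bdg.add_loc(b"chr1", 300, 400, 2.0)  # new value: new block appended
    #     bdg.get_data_by_chr(b"chr1")
    #     # -> [array('i', [100, 300, 400]), array('f', [0.0, 1.5, 2.0])]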

    @cython.ccall
    def add_loc_wo_merge(self, chromosome: bytes,
                         startpos: cython.int,
                         endpos: cython.int,
                         value: cython.float):
        """Add a chr-start-end-value block into __data dictionary.

        Note, we don't check if add_loc is called continuously on
        sorted regions without any gap. So we only suggest calling
        this function within MACS.

        This one won't merge nearby ranges with the same value.
        """
        if endpos <= 0:
            return
        if startpos < 0:
            startpos = 0

        if value < self.baseline_value:
            value = self.baseline_value

        if chromosome not in self.__data:
            self.__data[chromosome] = [pyarray('i', []),
                                       pyarray('f', [])]
            c = self.__data[chromosome]
            if startpos:
                # start pos is not 0, then add two blocks, the first
                # with "baseline_value"; the second with "value"
                c[0].append(startpos)
                c[1].append(self.baseline_value)

        c = self.__data[chromosome]
        c[0].append(endpos)
        c[1].append(value)
        if value > self.maxvalue:
            self.maxvalue = value
        if value < self.minvalue:
            self.minvalue = value

    @cython.ccall
    def add_chrom_data(self,
                       chromosome: bytes,
                       p: pyarray,
                       v: pyarray):
        """Add a pv data to a chromosome. Replace the previous data.

        p: a pyarray object 'i' for positions
        v: a pyarray object 'f' for values

        Note: no checks for error, use with caution
        """
        maxv: cython.float
        minv: cython.float

        self.__data[chromosome] = [p, v]
        maxv = max(v)
        minv = min(v)
        if maxv > self.maxvalue:
            self.maxvalue = maxv
        if minv < self.minvalue:
            self.minvalue = minv
        return

    @cython.ccall
    def add_chrom_data_PV(self,
                          chromosome: bytes,
                          pv: cnp.ndarray):
        """Add a pv data to a chromosome. Replace the previous data.

        This function converts a PV array (2-d named numpy array) into
        two python arrays for this BedGraph class. May be replaced by
        a better implementation later.

        Note: no checks for error, use with caution
        """
        maxv: cython.float
        minv: cython.float

        self.__data[chromosome] = [pyarray('i', pv['p']),
                                   pyarray('f', pv['v'])]
        minv = pv['v'].min()
        maxv = pv['v'].max()
        if maxv > self.maxvalue:
            self.maxvalue = maxv
        if minv < self.minvalue:
            self.minvalue = minv
        return
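
    # Editor's note: illustrative sketch, not from the original source.
    # add_chrom_data_PV() expects a structured ("named") numpy array with a
    # position field 'p' and a value field 'v', e.g.:
    #
    #     pv = np.zeros(3, dtype=[('p', 'i4'), ('v', 'f4')])
    #     pv['p'] = [100, 200, 300]
    #     pv['v'] = [1.0, 2.5, 0.5]
    #     bdg.add_chrom_data_PV(b"chr1", pv)
    #
    # The exact dtype used elsewhere in MACS3 may differ; the only
    # requirement visible here is that pv['p'] and pv['v'] exist.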

    @cython.ccall
    def destroy(self) -> bool:
        """Destroy content, free memory.
        """
        chrs: set
        chrom: bytes

        chrs = self.get_chr_names()
        for chrom in sorted(chrs):
            if chrom in self.__data:
                self.__data[chrom] = [None, None]
                self.__data.pop(chrom)
        return True

    @cython.ccall
    def get_data_by_chr(self, chromosome: bytes) -> list:
        """Return array of counts by chromosome.

        The return value is a list of two arrays:
        [end positions, values]
        """
        if chromosome in self.__data:
            return self.__data[chromosome]
        else:
            return []

    @cython.ccall
    def get_chr_names(self) -> set:
        """Return all the chromosome names stored.

        """
        return set(sorted(self.__data.keys()))

    @cython.ccall
    def reset_baseline(self, baseline_value: cython.float):
        """Reset the baseline value to baseline_value.

        Any region with a score between the original
        self.baseline_value and the new baseline_value will be set to
        baseline_value.

        """
        self.baseline_value = baseline_value
        self.filter_score(cutoff=baseline_value)
        self.merge_regions()
        return

    @cython.cfunc
    def merge_regions(self):
        """Merge nearby regions with the same value.

        """
        # new_pre_pos: cython.int
        pos: cython.int
        i: cython.int
        new_pre_value: cython.float
        value: cython.float
        chrom: bytes
        chrs: set

        chrs = self.get_chr_names()
        for chrom in sorted(chrs):
            (p, v) = self.__data[chrom]
            pnext = iter(p).__next__
            vnext = iter(v).__next__

            # new arrays
            new_pos = pyarray('L', [pnext(),])
            new_value = pyarray('f', [vnext(),])

            newpa = new_pos.append
            newva = new_value.append

            # new_pre_pos = new_pos[0]
            new_pre_value = new_value[0]

            for i in range(1, len(p)):
                pos = pnext()
                value = vnext()
                if value == new_pre_value:
                    new_pos[-1] = pos
                else:
                    # add new region
                    newpa(pos)
                    newva(value)
                    # new_pre_pos = pos
                    new_pre_value = value
            self.__data[chrom] = [new_pos, new_value]
        return True

    @cython.ccall
    def filter_score(self, cutoff: cython.float = 0) -> bool:
        """Filter using a score cutoff. Any region lower than the score
        cutoff will be set to self.baseline_value.

        Self will be modified.
        """
        # new_pre_pos: cython.int
        pos: cython.int
        i: cython.int
        new_pre_value: cython.float
        value: cython.float
        chrom: bytes
        chrs: set

        chrs = self.get_chr_names()
        for chrom in sorted(chrs):
            (p, v) = self.__data[chrom]
            pnext = iter(p).__next__
            vnext = iter(v).__next__

            # new arrays
            new_pos = pyarray('L', [])
            new_value = pyarray('f', [])
            # new_pre_pos = 0
            new_pre_value = 0

            for i in range(len(p)):
                pos = pnext()
                value = vnext()

                if value < cutoff:
                    # this region will be set to baseline_value
                    if new_pos and new_pre_value == self.baseline_value:
                        # if preceding region is at baseline, extend it
                        new_pos[-1] = pos
                    else:
                        # else add a new baseline region
                        new_pos.append(pos)
                        new_value.append(self.baseline_value)
                else:
                    # put it into new arrays
                    new_pos.append(pos)
                    new_value.append(value)
                # new_pre_pos = new_pos[-1]
                new_pre_value = new_value[-1]
            self.__data[chrom] = [new_pos, new_value]
        return True

    @cython.ccall
    def summary(self) -> tuple:
        """Calculate the sum, total_length, max, min, mean, and std.

        Return a tuple for (sum, total_length, max, min, mean, std).
        """
        n_v: cython.long
        sum_v: cython.float
        max_v: cython.float
        min_v: cython.float
        mean_v: cython.float
        variance: cython.float
        tmp: cython.float
        std_v: cython.float
        pre_p: cython.int
        ln: cython.int
        i: cython.int

        pre_p = 0
        n_v = 0
        sum_v = 0
        max_v = -100000
        min_v = 100000
        for (p, v) in self.__data.values():
            # for each chromosome
            pre_p = 0
            for i in range(len(p)):
                # for each region
                ln = p[i]-pre_p
                sum_v += v[i]*ln
                n_v += ln
                pre_p = p[i]
            max_v = max(max(v), max_v)
            min_v = min(min(v), min_v)
        mean_v = sum_v/n_v
        variance = 0.0
        for (p, v) in self.__data.values():
            # reset the previous position for each chromosome
            pre_p = 0
            for i in range(len(p)):
                # for each region
                tmp = v[i]-mean_v
                ln = p[i]-pre_p
                variance += tmp*tmp*ln
                pre_p = p[i]

        variance /= float(n_v-1)
        std_v = sqrt(variance)
        return (sum_v, n_v, max_v, min_v, mean_v, std_v)

    @cython.ccall
    def call_peaks(self,
                   cutoff: cython.float = 1,
                   min_length: cython.int = 200,
                   max_gap: cython.int = 50,
                   call_summits: bool = False):
        """This function tries to find regions within which scores
        are continuously higher than a given cutoff.

        This function is NOT using sliding windows. Instead, any
        region in the bedGraph above the cutoff will be detected, then
        merged with a nearby region if the gap between them is below
        max_gap. After this, a peak is reported if its length is above
        min_length.

        cutoff: cutoff of value, default 1.
        min_length : minimum peak length, default 200.
        max_gap : maximum gap to merge nearby peaks, default 50.

        Removed option:

        up_limit: the highest acceptable value. Default 10^{310}
          * so only allow peak with value >=cutoff and <=up_limit

        This does not work. The region above the upper limit may still
        be included as `gap`.
        """
        # peak_length: cython.int
        x: cython.int
        pre_p: cython.int
        p: cython.int
        i: cython.int
        v: cython.float
        chrom: bytes
        chrs: set

        chrs = self.get_chr_names()
        peaks = PeakIO()        # dictionary to save peaks
        for chrom in sorted(chrs):
            peak_content = None
            # peak_length = 0
            (ps, vs) = self.get_data_by_chr(chrom)  # arrays for position and values
            psn = iter(ps).__next__  # assign the next function to a variable to speed up
            vsn = iter(vs).__next__
            x = 0
            pre_p = 0           # remember previous position
            while True:
                # find the first region above cutoff
                try:            # try to read the first data range for this chrom
                    p = psn()
                    v = vsn()
                except Exception:
                    break
                x += 1          # index for the next point
                if v >= cutoff:
                    peak_content = [(pre_p, p, v),]
                    pre_p = p
                    break       # found the first range above cutoff
                else:
                    pre_p = p

            for i in range(x, len(ps)):
                # continue scanning the rest of the regions
                p = psn()
                v = vsn()
                if v < cutoff:  # not be detected as 'peak'
                    pre_p = p
                    continue
                # for points above cutoff
                # if the gap is allowed
                if pre_p - peak_content[-1][1] <= max_gap:
                    peak_content.append((pre_p, p, v))
                else:
                    # when the gap is not allowed, close this peak
                    self.__close_peak(peak_content,
                                      peaks,
                                      min_length,
                                      chrom)  # , smoothlen=max_gap / 2)
                    # start a new peak
                    peak_content = [(pre_p, p, v),]
                pre_p = p

            # save the last peak
            if not peak_content:
                continue
            self.__close_peak(peak_content,
                              peaks,
                              min_length,
                              chrom)  # , smoothlen=max_gap / 2)
        return peaks
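
    # Editor's note: a hedged usage sketch. Assuming `bdg` is a
    # bedGraphTrackI filled with scores (e.g. -log10 p-values), regions where
    # the score stays at or above 2 can be called like this:
    #
    #     peaks = bdg.call_peaks(cutoff=2, min_length=200, max_gap=50)
    #     # `peaks` is a PeakIO object; regions closer than max_gap are
    #     # merged, and merged regions shorter than min_length are dropped.
    #
    # The cutoff value above is a made-up example, not a recommended default.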

    @cython.cfunc
    def __close_peak(self,
                     peak_content: list,
                     peaks,
                     min_length: cython.int,
                     chrom: bytes) -> bool:
        tsummit: list           # list for temporary summits
        peak_length: cython.int
        summit: cython.int
        tstart: cython.int
        tend: cython.int
        summit_value: cython.float
        tvalue: cython.float

        peak_length = peak_content[-1][1]-peak_content[0][0]
        if peak_length >= min_length:  # if the peak is too small, reject it
            tsummit = []
            summit = 0
            summit_value = 0
            for (tstart, tend, tvalue) in peak_content:
                if not summit_value or summit_value < tvalue:
                    tsummit = [cython.cast(cython.int, (tend+tstart)/2),]
                    summit_value = tvalue
                elif summit_value == tvalue:
                    tsummit.append(cython.cast(cython.int, (tend+tstart)/2))
            summit = tsummit[cython.cast(cython.int, (len(tsummit)+1)/2)-1]
            peaks.add(chrom,
                      peak_content[0][0],
                      peak_content[-1][1],
                      summit=summit,
                      peak_score=summit_value,
                      pileup=0,
                      pscore=0,
                      fold_change=0,
                      qscore=0
                      )
        return True

    @cython.ccall
    def call_broadpeaks(self,
                        lvl1_cutoff: cython.float = 500,
                        lvl2_cutoff: cython.float = 100,
                        min_length: cython.int = 200,
                        lvl1_max_gap: cython.int = 50,
                        lvl2_max_gap: cython.int = 400):
        """This function tries to find enriched regions within which
        scores are continuously higher than a given cutoff for level
        1, and link them using the gap above the level 2 cutoff with a
        maximum length of lvl2_max_gap.

        lvl1_cutoff: cutoff of value at enriched regions, default 500.
        lvl2_cutoff: cutoff of value at linkage regions, default 100.
        min_length : minimum peak length, default 200.
        lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50.
        lvl2_max_gap : maximum length of linkage regions, default 400.

        Return both a general PeakIO object for highly enriched regions
        and gapped broad regions in a BroadPeakIO object.
        """
        chrom: bytes
        i: cython.int
        j: cython.int
        chrs: set
        lvl1: PeakContent
        lvl2: PeakContent       # PeakContent class object
        lvl1peakschrom: list
        lvl2peakschrom: list

        assert lvl1_cutoff > lvl2_cutoff, "level 1 cutoff should be larger than level 2."
        assert lvl1_max_gap < lvl2_max_gap, "level 2 maximum gap should be larger than level 1."
        lvl1_peaks = self.call_peaks(cutoff=lvl1_cutoff,
                                     min_length=min_length,
                                     max_gap=lvl1_max_gap,
                                     call_summits=False)
        lvl2_peaks = self.call_peaks(cutoff=lvl2_cutoff,
                                     min_length=min_length,
                                     max_gap=lvl2_max_gap,
                                     call_summits=False)
        chrs = lvl1_peaks.get_chr_names()
        broadpeaks = BroadPeakIO()
        # use lvl2_peaks as linking regions between lvl1_peaks
        for chrom in sorted(chrs):
            lvl1peakschrom = lvl1_peaks.get_data_from_chrom(chrom)
            lvl2peakschrom = lvl2_peaks.get_data_from_chrom(chrom)
            lvl1peakschrom_next = iter(lvl1peakschrom).__next__
            tmppeakset = []     # to temporarily store lvl1 regions inside a lvl2 region
            # our assumption is lvl1 regions should be included in lvl2 regions
            try:
                lvl1 = lvl1peakschrom_next()
                for i in range(len(lvl2peakschrom)):
                    # for each lvl2 peak, find all lvl1 peaks inside
                    lvl2 = lvl2peakschrom[i]
                    while True:
                        if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]:
                            tmppeakset.append(lvl1)
                            lvl1 = lvl1peakschrom_next()
                        else:
                            self.__add_broadpeak(broadpeaks,
                                                 chrom,
                                                 lvl2,
                                                 tmppeakset)
                            tmppeakset = []
                            break
            except StopIteration:
                self.__add_broadpeak(broadpeaks, chrom, lvl2, tmppeakset)
                tmppeakset = []
                for j in range(i+1, len(lvl2peakschrom)):
                    self.__add_broadpeak(broadpeaks,
                                         chrom,
                                         lvl2peakschrom[j],
                                         tmppeakset)
        return broadpeaks
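
    # Editor's note: illustrative sketch only. Broad calling uses two
    # cutoffs: stringent lvl1 peaks are nested inside relaxed lvl2 peaks,
    # and the lvl2 regions act as linkers:
    #
    #     broad = bdg.call_broadpeaks(lvl1_cutoff=4, lvl2_cutoff=2,
    #                                 min_length=200,
    #                                 lvl1_max_gap=50, lvl2_max_gap=400)
    #     # `broad` is a BroadPeakIO object in a gappedPeak-like layout.
    #
    # The cutoff values above are made-up examples, not recommended defaults.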

    @cython.cfunc
    def __add_broadpeak(self,
                        bpeaks,
                        chrom: bytes,
                        lvl2peak: PeakContent,
                        lvl1peakset: list):
        """Internal function to create a broad peak.
        """
        start: cython.int
        end: cython.int
        blockNum: cython.int
        blockSizes: bytes
        blockStarts: bytes
        thickStart: bytes
        thickEnd: bytes

        start = lvl2peak["start"]
        end = lvl2peak["end"]

        # the following code will add those broad/lvl2 peaks with no
        # strong/lvl1 peaks inside
        if not lvl1peakset:
            # try:
            # will complement by adding 1bp blocks at the start and end of this region
            # may change in the future if the gappedPeak format is improved.
            bpeaks.add(chrom, start, end,
                       score=lvl2peak["score"],
                       thickStart=(b"%d" % start),
                       thickEnd=(b"%d" % end),
                       blockNum=2,
                       blockSizes=b"1,1",
                       blockStarts=(b"0,%d" % (end-start-1)),
                       pileup=lvl2peak["pileup"],
                       pscore=lvl2peak["pscore"],
                       fold_change=lvl2peak["fc"],
                       qscore=lvl2peak["qscore"])
            return bpeaks

        thickStart = b"%d" % lvl1peakset[0]["start"]
        thickEnd = b"%d" % lvl1peakset[-1]["end"]
        blockNum = len(lvl1peakset)
        blockSizes = b",".join([b"%d" % x["length"] for x in lvl1peakset])
        blockStarts = b",".join([b"%d" % (x["start"]-start) for x in lvl1peakset])

        if int(thickStart) != start:
            # add 1bp left block
            thickStart = b"%d" % start
            blockNum += 1
            blockSizes = b"1,"+blockSizes
            blockStarts = b"0,"+blockStarts
        if int(thickEnd) != end:
            # add 1bp right block
            thickEnd = b"%d" % end
            blockNum += 1
            blockSizes = blockSizes+b",1"
            blockStarts = blockStarts + b"," + (b"%d" % (end-start-1))

        bpeaks.add(chrom, start, end,
                   score=lvl2peak["score"],
                   thickStart=thickStart,
                   thickEnd=thickEnd,
                   blockNum=blockNum,
                   blockSizes=blockSizes,
                   blockStarts=blockStarts,
                   pileup=lvl2peak["pileup"],
                   pscore=lvl2peak["pscore"],
                   fold_change=lvl2peak["fc"],
                   qscore=lvl2peak["qscore"])
        return bpeaks

    @cython.ccall
    def refine_peaks(self, peaks):
        """Based on the given peaks, this function re-evaluates each
        peak region and calls the summit.

        peaks: PeakIO object
        return: a new PeakIO object

        """
        pre_p: cython.int
        p: cython.int
        peak_s: cython.int
        peak_e: cython.int
        v: cython.float
        chrom: bytes
        chrs: set

        peaks.sort()
        new_peaks = PeakIO()
        chrs = self.get_chr_names()
        assert isinstance(peaks, PeakIO)
        chrs = chrs.intersection(set(peaks.get_chr_names()))

        for chrom in sorted(chrs):
            peaks_chr = peaks.get_data_from_chrom(chrom)
            peak_content = []
            # arrays for position and values
            (ps, vs) = self.get_data_by_chr(chrom)
            # assign the next function to a variable to speed up
            psn = iter(ps).__next__
            vsn = iter(vs).__next__
            peakn = iter(peaks_chr).__next__

            # remember previous position in bedgraph/self
            pre_p = 0
            p = psn()
            v = vsn()
            peak = peakn()
            peak_s = peak["start"]
            peak_e = peak["end"]
            while True:
                # look for overlap
                if p > peak_s and peak_e > pre_p:
                    # now put four coordinates together and pick the middle two
                    s, e = sorted([p, peak_s, peak_e, pre_p])[1:3]
                    # add this content
                    peak_content.append((s, e, v))
                    # move self/bedGraph
                    try:
                        pre_p = p
                        p = psn()
                        v = vsn()
                    except Exception:
                        # no more value chunk in bedGraph
                        break
                elif pre_p >= peak_e:
                    # close peak
                    self.__close_peak(peak_content, new_peaks, 0, chrom)
                    peak_content = []
                    # move peak
                    try:
                        peak = peakn()
                        peak_s = peak["start"]
                        peak_e = peak["end"]
                    except Exception:
                        # no more peak
                        break
                elif peak_s >= p:
                    # move self/bedgraph
                    try:
                        pre_p = p
                        p = psn()
                        v = vsn()
                    except Exception:
                        # no more value chunk in bedGraph
                        break
                else:
                    raise Exception(f"no way here! prev position:{pre_p}; position:{p}; value:{v}; peak start:{peak_s}; peak end:{peak_e}")

            # save the last peak
            if peak_content:
                self.__close_peak(peak_content, new_peaks, 0, chrom)
        return new_peaks

    @cython.ccall
    def total(self) -> cython.int:
        """Return the number of regions in this object.

        """
        t: cython.int
        t = 0
        for (p, v) in self.__data.values():
            t += len(p)
        return t

    @cython.ccall
    def set_single_value(self, new_value: cython.float):
        """Change all the values in bedGraph to the same new_value,
        return a new bedGraphTrackI.

        """
        chrom: bytes
        max_p: cython.int

        ret = bedGraphTrackI()
        chroms = set(self.get_chr_names())
        for chrom in sorted(chroms):
            # arrays for position and values
            (p1, v1) = self.get_data_by_chr(chrom)
            # maximum p
            max_p = max(p1)
            # add a region from 0 to max_p
            ret.add_loc(chrom, 0, max_p, new_value)
        return ret

    @cython.ccall
    def overlie(self, bdgTracks, func: str = "max"):
        """Calculate two or more bedGraphTrackI objects by letting self
        overlie the tracks in bdgTracks, with user-defined functions.

        Transition positions from all bedGraphTrackI objects will be
        considered and combined. For example:

           #1 bedGraph (self)   |  #2 bedGraph
        -----------------------------------------------
        chr1  0    100  0       |  chr1  0    150  1
        chr1  100  200  3       |  chr1  150  250  2
        chr1  200  300  4       |  chr1  250  300  4

        these two bedGraphs will be combined to have five transition
        points: 100, 150, 200, 250, and 300. So in order to calculate
        two bedGraphs, I pair values within the following regions
        like:

        chr   s    e    (#1,#2)  applied_func_max
        -----------------------------------------------
        chr1  0    100  (0,1)    1
        chr1  100  150  (3,1)    3
        chr1  150  200  (3,2)    3
        chr1  200  250  (4,2)    4
        chr1  250  300  (4,4)    4

        Then the given 'func' will be applied on each 2-tuple as func(#1,#2)

        Supported 'func' are "sum", "subtract" (only for two bdg
        objects), "product", "divide" (only for two bdg objects),
        "max", "mean" and "fisher".

        Return value is a new bedGraphTrackI object.

        Option: bdgTracks can be a list of bedGraphTrackI objects
        """
        pre_p: cython.int
        chrom: bytes

        nr_tracks = len(bdgTracks) + 1  # +1 for self
        assert nr_tracks >= 2, "Specify at least one more bdg object."
        for i, bdgTrack in enumerate(bdgTracks):
            assert isinstance(bdgTrack, bedGraphTrackI), "bdgTrack{} is not a bedGraphTrackI object".format(i + 1)

        if func == "max":
            f = max
        elif func == "mean":
            f = mean_func
        elif func == "fisher":
            f = fisher_func
        elif func == "sum":
            f = sum
        elif func == "product":
            f = product_func
        elif func == "subtract":
            if nr_tracks == 2:
                f = subtract_func
            else:
                raise Exception(f"Only one more bdg object is allowed, but provided {nr_tracks-1}")
        elif func == "divide":
            if nr_tracks == 2:
                f = divide_func
            else:
                raise Exception(f"Only one more bdg object is allowed, but provided {nr_tracks-1}")
        else:
            raise Exception(f"Invalid function {func}! Choose from 'sum', 'subtract' (only for two bdg objects), 'product', 'divide' (only for two bdg objects), 'max', 'mean' and 'fisher'. ")

        ret = bedGraphTrackI()

        common_chr = set(self.get_chr_names())
        for track in bdgTracks:
            common_chr = common_chr.intersection(set(track.get_chr_names()))

        for chrom in sorted(common_chr):
            datas = [self.get_data_by_chr(chrom)]
            datas.extend([bdgTracks[i].get_data_by_chr(chrom) for i in range(len(bdgTracks))])

            ps, vs, pn, vn = [], [], [], []
            for data in datas:
                ps.append(data[0])
                pn.append(iter(ps[-1]).__next__)
                vs.append(data[1])
                vn.append(iter(vs[-1]).__next__)

            pre_p = 0  # remember the previous position in the new bedGraphTrackI object ret

            try:
                ps_cur = [pn[i]() for i in range(len(pn))]
                vs_cur = [vn[i]() for i in range(len(pn))]

                while True:
                    # get the lowest position
                    lowest_p = min(ps_cur)

                    # at least one lowest position, could be multiple
                    locations = [i for i in range(len(ps_cur)) if ps_cur[i] == lowest_p]

                    # add the data until the interval
                    ret.add_loc(chrom, pre_p, ps_cur[locations[0]], f(vs_cur))

                    pre_p = ps_cur[locations[0]]
                    for index in locations:
                        ps_cur[index] = pn[index]()
                        vs_cur[index] = vn[index]()
            except StopIteration:
                # meet the end of either bedGraphTrackI, simply exit
                pass
        return ret
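
    # Editor's note: usage sketch matching the docstring example above
    # (hypothetical tracks t1 and t2):
    #
    #     t1 = bedGraphTrackI()
    #     t2 = bedGraphTrackI()
    #     t1.add_loc(b"chr1", 0, 100, 0)
    #     t1.add_loc(b"chr1", 100, 200, 3)
    #     t1.add_loc(b"chr1", 200, 300, 4)
    #     t2.add_loc(b"chr1", 0, 150, 1)
    #     t2.add_loc(b"chr1", 150, 250, 2)
    #     t2.add_loc(b"chr1", 250, 300, 4)
    #     combined = t1.overlie([t2], func="max")
    #     # `combined` holds the per-interval maxima from the docstring table
    #     # (1, 3, 3, 4, 4 over the five sub-intervals), with equal-valued
    #     # neighbouring intervals merged by add_loc().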

    @cython.ccall
    def apply_func(self, func) -> bool:
        """Apply function 'func' to every value in this bedGraphTrackI object.

        *Two adjacent regions with the same value after applying func will
        not be merged.
        """
        i: cython.int

        for (p, s) in self.__data.values():
            for i in range(len(s)):
                s[i] = func(s[i])
        self.maxvalue = func(self.maxvalue)
        self.minvalue = func(self.minvalue)
        return True

    @cython.ccall
    def p2q(self):
        """Convert pvalue scores to qvalue scores.

        *Assume scores in this bedGraph are pvalue scores! It does not
        work for other types of scores.
        """
        chrom: bytes
        pos_array: pyarray
        pscore_array: pyarray
        pvalue_stat: dict = {}
        pqtable: dict = {}
        pre_p: cython.long
        this_p: cython.long
        # pre_l: cython.long
        # l: cython.long
        i: cython.long
        nhcal: cython.long = 0
        N: cython.long
        k: cython.long
        this_l: cython.long
        this_v: cython.float
        # pre_v: cython.float
        v: cython.float
        q: cython.float
        pre_q: cython.float
        f: cython.float
        unique_values: list

        # calculate frequencies of each p-score
        for chrom in sorted(self.get_chr_names()):
            pre_p = 0

            [pos_array, pscore_array] = self.__data[chrom]

            pn = iter(pos_array).__next__
            vn = iter(pscore_array).__next__

            for i in range(len(pos_array)):
                this_p = pn()
                this_v = vn()
                this_l = this_p - pre_p
                if this_v in pvalue_stat:
                    pvalue_stat[this_v] += this_l
                else:
                    pvalue_stat[this_v] = this_l
                pre_p = this_p

            # nhcal += len(pos_array)

        # nhval = 0

        N = sum(pvalue_stat.values())  # total length
        k = 1                          # rank
        f = -log10(N)
        # pre_v = -2147483647
        # pre_l = 0
        pre_q = 2147483647             # save the previous q-value

        # calculate qscore for each pscore
        pqtable = {}
        unique_values = sorted(pvalue_stat.keys(), reverse=True)
        for i in range(len(unique_values)):
            v = unique_values[i]
            # l = pvalue_stat[v]
            q = v + (log10(k) + f)
            q = max(0, min(pre_q, q))  # make q-score monotonic
            pqtable[v] = q
            # pre_v = v
            pre_q = q
            # k += l
            nhcal += 1

        # convert pscore to qscore
        for chrom in sorted(self.get_chr_names()):
            [pos_array, pscore_array] = self.__data[chrom]

            for i in range(len(pos_array)):
                pscore_array[i] = pqtable[pscore_array[i]]

        self.merge_regions()
        return
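
    # Editor's note: a short numeric illustration of the conversion above
    # (values are made up). With N the total covered length and f = -log10(N),
    # a unique -log10 p-score v is mapped to
    #     q = v + log10(k) + f
    # clamped to be non-negative and monotonically non-increasing down the
    # ranking. For example, with N = 1_000_000 (f = -6) and rank k = 1, a
    # p-score of 8.0 maps to a q-score of 2.0.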

    @cython.ccall
    def extract_value(self, bdgTrack2):
        """Extract values from regions defined in bedGraphTrackI class object
        `bdgTrack2`.

        """
        pre_p: cython.int
        p1: cython.int
        p2: cython.int
        i: cython.int
        v1: cython.float
        v2: cython.float
        chrom: bytes

        assert isinstance(bdgTrack2, bedGraphTrackI), "not a bedGraphTrackI object"

        # 1: region in bdgTrack2; 2: value; 3: length with the value
        ret = [[], pyarray('f', []), pyarray('L', [])]
        radd = ret[0].append
        vadd = ret[1].append
        ladd = ret[2].append

        chr1 = set(self.get_chr_names())
        chr2 = set(bdgTrack2.get_chr_names())
        common_chr = chr1.intersection(chr2)
        for i in range(len(common_chr)):
            chrom = common_chr.pop()
            (p1s, v1s) = self.get_data_by_chr(chrom)  # arrays for position and values
            # assign the next function to a variable to speed up
            p1n = iter(p1s).__next__
            v1n = iter(v1s).__next__

            # arrays for position and values
            (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom)
            # assign the next function to a variable to speed up
            p2n = iter(p2s).__next__
            v2n = iter(v2s).__next__
            # remember the previous position in the new bedGraphTrackI
            # object ret
            pre_p = 0
            try:
                p1 = p1n()
                v1 = v1n()

                p2 = p2n()
                v2 = v2n()

                while True:
                    if p1 < p2:
                        # clip a region from pre_p to p1, then set pre_p as p1.
                        if v2 > 0:
                            radd(str(chrom)+"."+str(pre_p)+"."+str(p1))
                            vadd(v1)
                            ladd(p1-pre_p)
                        pre_p = p1
                        # call for the next p1 and v1
                        p1 = p1n()
                        v1 = v1n()
                    elif p2 < p1:
                        # clip a region from pre_p to p2, then set
                        # pre_p as p2.
                        if v2 > 0:
                            radd(str(chrom)+"."+str(pre_p)+"."+str(p2))
                            vadd(v1)
                            ladd(p2-pre_p)
                        pre_p = p2
                        # call for the next p2 and v2
                        p2 = p2n()
                        v2 = v2n()
                    elif p1 == p2:
                        # from pre_p to p1 or p2, then set pre_p as p1 or p2.
                        if v2 > 0:
                            radd(str(chrom)+"."+str(pre_p)+"."+str(p1))
                            vadd(v1)
                            ladd(p1-pre_p)
                        pre_p = p1
                        # call for the next p1, v1, p2, v2.
                        p1 = p1n()
                        v1 = v1n()
                        p2 = p2n()
                        v2 = v2n()
            except StopIteration:
                # meet the end of either bedGraphTrackI, simply exit
                pass

        return ret

    @cython.ccall
    def extract_value_hmmr(self, bdgTrack2):
        """Extract values from regions defined in bedGraphTrackI class object
        `bdgTrack2`.

        I will try to tweak this function to output only the values of
        bdgTrack1 (self) in the regions in bdgTrack2.

        This is specifically for HMMRATAC. bdgTrack2 should be a
        bedgraph object containing the bins with value set to
        'mark_bin' -- the bins in the same region will have the same
        value.
        """
        # pre_p: cython.int
        p1: cython.int
        p2: cython.int
        i: cython.int
        v1: cython.float
        v2: cython.float
        chrom: bytes
        ret: list

        assert isinstance(bdgTrack2, bedGraphTrackI), "not a bedGraphTrackI object"

        # 0: bin location (chrom, position); 1: value; 2: number of bins in this region
        ret = [[], pyarray('f', []), pyarray('i', [])]
        padd = ret[0].append
        vadd = ret[1].append
        ladd = ret[2].append

        chr1 = set(self.get_chr_names())
        chr2 = set(bdgTrack2.get_chr_names())
        common_chr = sorted(list(chr1.intersection(chr2)))
        for i in range(len(common_chr)):
            chrom = common_chr.pop()
            # arrays for position and values
            (p1s, v1s) = self.get_data_by_chr(chrom)
            # assign the next function to a variable to speed up
            p1n = iter(p1s).__next__
            v1n = iter(v1s).__next__

            # arrays for position and values
            (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom)
            # assign the next function to a variable to speed up
            p2n = iter(p2s).__next__
            v2n = iter(v2s).__next__
            # remember the previous position in the new bedGraphTrackI
            # object ret
            # pre_p = 0
            try:
                p1 = p1n()
                v1 = v1n()

                p2 = p2n()
                v2 = v2n()

                while True:
                    if p1 < p2:
                        # clip a region from pre_p to p1, then set pre_p as p1.
                        # in this case, we don't output any
                        # if v2>0:
                        #    radd(str(chrom)+"."+str(pre_p)+"."+str(p1))
                        #    vadd(v1)
                        #    ladd(p1-pre_p)
                        # pre_p = p1
                        # call for the next p1 and v1
                        p1 = p1n()
                        v1 = v1n()
                    elif p2 < p1:
                        # clip a region from pre_p to p2, then set pre_p as p2.
                        if v2 != 0:  # 0 means it's a gap region, we should have value > 1
                            padd((chrom, p2))
                            vadd(v1)
                            ladd(int(v2))
                        # pre_p = p2
                        # call for the next p2 and v2
                        p2 = p2n()
                        v2 = v2n()
                    elif p1 == p2:
                        # from pre_p to p1 or p2, then set pre_p as p1 or p2.
                        if v2 != 0:  # 0 means it's a gap region, we should have 1 or -1
                            padd((chrom, p2))
                            vadd(v1)
                            ladd(int(v2))
                        # pre_p = p1
                        # call for the next p1, v1, p2, v2.
                        p1 = p1n()
                        v1 = v1n()
                        p2 = p2n()
                        v2 = v2n()
            except StopIteration:
                # meet the end of either bedGraphTrackI, simply exit
                pass

        return ret

    @cython.ccall
    def make_ScoreTrackII_for_macs(self, bdgTrack2,
                                   depth1: float = 1.0,
                                   depth2: float = 1.0):
        """A modified overlie function for MACS v2.

        effective_depth_in_million: sequencing depth in million after
                                    duplicates being filtered. If
                                    treatment is scaled down to
                                    control sample size, then this
                                    should be control sample size in
                                    million. And vice versa.

        Return value is a ScoreTrackII object.
        """
        # pre_p: cython.int
        p1: cython.int
        p2: cython.int
        v1: cython.float
        v2: cython.float
        chrom: bytes

        assert isinstance(bdgTrack2, bedGraphTrackI), "bdgTrack2 is not a bedGraphTrackI object"

        ret = ScoreTrackII(treat_depth=depth1,
                           ctrl_depth=depth2)
        retadd = ret.add

        chr1 = set(self.get_chr_names())
        chr2 = set(bdgTrack2.get_chr_names())
        common_chr = chr1.intersection(chr2)
        for chrom in sorted(common_chr):
            # arrays for position and values
            (p1s, v1s) = self.get_data_by_chr(chrom)
            # assign the next function to a variable to speed up
            p1n = iter(p1s).__next__
            v1n = iter(v1s).__next__
            # arrays for position and values
            (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom)
            # assign the next function to a variable to speed up
            p2n = iter(p2s).__next__
            v2n = iter(v2s).__next__

            # this is the maximum number of locations needed to be
            # recorded in scoreTrackI for this chromosome.
            chrom_max_len = len(p1s)+len(p2s)

            ret.add_chromosome(chrom, chrom_max_len)

            # remember the previous position in the new bedGraphTrackI
            # object ret
            # pre_p = 0

            try:
                p1 = p1n()
                v1 = v1n()

                p2 = p2n()
                v2 = v2n()

                while True:
                    if p1 < p2:
                        # clip a region from pre_p to p1, then set pre_p as p1.
                        retadd(chrom, p1, v1, v2)
                        # pre_p = p1
                        # call for the next p1 and v1
                        p1 = p1n()
                        v1 = v1n()
                    elif p2 < p1:
                        # clip a region from pre_p to p2, then set pre_p as p2.
                        retadd(chrom, p2, v1, v2)
                        # pre_p = p2
                        # call for the next p2 and v2
                        p2 = p2n()
                        v2 = v2n()
                    elif p1 == p2:
                        # from pre_p to p1 or p2, then set pre_p as p1 or p2.
                        retadd(chrom, p1, v1, v2)
                        # pre_p = p1
                        # call for the next p1, v1, p2, v2.
                        p1 = p1n()
                        v1 = v1n()
                        p2 = p2n()
                        v2 = v2n()
            except StopIteration:
                # meet the end of either bedGraphTrackI, simply exit
                pass

        ret.finalize()
        # ret.merge_regions()
        return ret
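
    # Editor's note: hedged usage sketch. Given a treatment pileup track and
    # a control lambda track (both bedGraphTrackI objects; the names and
    # depth values below are hypothetical):
    #
    #     score_track = treat_bdg.make_ScoreTrackII_for_macs(ctrl_bdg,
    #                                                        depth1=20.0,
    #                                                        depth2=20.0)
    #     # score_track is a ScoreTrackII keyed by the union of transition
    #     # points of the two tracks, holding (treatment, control) value
    #     # pairs per position.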

    @cython.ccall
    def cutoff_analysis(self,
                        max_gap: cython.int,
                        min_length: cython.int,
                        steps: cython.int = 100,
                        min_score: cython.float = 0,
                        max_score: cython.float = 1000) -> str:
        """
        Cutoff analysis function for bedGraphTrackI object.

        This function will try all possible cutoff values on the score
        column to call peaks. Then it will give a report of a number of
        metrics (number of peaks, total length of peaks, average
        length of peak) at varying score cutoffs. For each score
        cutoff, the function finds the positions where the score
        exceeds the cutoff, then groups those positions into "peaks"
        based on the maximum allowed gap (max_gap) between consecutive
        positions. If a peak's length exceeds the minimum length
        (min_length), the peak is counted.

        Parameters
        ----------

        max_gap : int32_t
            Maximum allowed gap between consecutive positions above cutoff

        min_length : int32_t
            Minimum length of peak

        steps : int32_t
            It will be used to calculate 'step' to increase from min_v to
            max_v (see below).

        min_score : float32_t
            Minimum score for cutoff analysis. Note1: we will take the
            larger value between the actual minimum value in the BedGraph
            and min_score as min_v. Note2: the min_v won't be included in
            the final result. We will try to output the smallest cutoff as
            min_v+step.

        max_score : float32_t
            Maximum score for cutoff analysis. Note1: we will take the
            smaller value between the actual maximum value in the BedGraph
            and max_score as max_v. Note2: the max_v may not be included
            in the final result. We will only output the cutoff that can
            generate at least 1 peak.

        Returns
        -------

        Cutoff analysis report in str object.

        Todos
        -----

        May need to separate this function out as a class so that we
        can add more ways to analyze the result. Also, we can let this
        function return a list of dictionaries or a data.frame instead
        of a str object.
        """
        chrs: set
        peak_content: list
        ret_list: list
        cutoff_list: list
        cutoff_npeaks: list
        cutoff_lpeaks: list
        chrom: bytes
        ret: str
        cutoff: cython.float
        total_l: cython.long
        total_p: cython.long
        i: cython.long
        n: cython.long
        ts: cython.long
        te: cython.long
        lastp: cython.long
        tl: cython.long
        peak_length: cython.long
        # dict cutoff_npeaks, cutoff_lpeaks
        s: cython.float

        chrs = self.get_chr_names()

        # midvalue = self.minvalue/2 + self.maxvalue/2
        # s = float(self.minvalue - midvalue)/steps
        minv = max(min_score, self.minvalue)
        maxv = min(self.maxvalue, max_score)

        s = float(maxv - minv)/steps

        # a list of possible cutoff values from minv to maxv with step of s
        cutoff_list = [round(value, 3) for value in np.arange(minv, maxv, s)]

        cutoff_npeaks = [0] * len(cutoff_list)
        cutoff_lpeaks = [0] * len(cutoff_list)

        for chrom in sorted(chrs):
            (pos_array, score_array) = self.__data[chrom]
            pos_array = np.array(self.__data[chrom][0])
            score_array = np.array(self.__data[chrom][1])

            for n in range(len(cutoff_list)):
                cutoff = cutoff_list[n]
                total_l = 0     # total length of peaks
                total_p = 0     # total number of peaks

                # get the regions with scores above cutoffs. This is
                # not an optimized method. It would be better to store
                # score array in a 2-D ndarray?
                above_cutoff = np.nonzero(score_array > cutoff)[0]
                # end positions of regions where score is above cutoff
                above_cutoff_endpos = pos_array[above_cutoff]
                # start positions of regions where score is above cutoff
                above_cutoff_startpos = pos_array[above_cutoff-1]

                if above_cutoff_endpos.size == 0:
                    continue

                # first bit of region above cutoff
                acs_next = iter(above_cutoff_startpos).__next__
                ace_next = iter(above_cutoff_endpos).__next__

                ts = acs_next()
                te = ace_next()
                peak_content = [(ts, te),]
                lastp = te

                for i in range(1, above_cutoff_startpos.size):
                    ts = acs_next()
                    te = ace_next()
                    tl = ts - lastp
                    if tl <= max_gap:
                        peak_content.append((ts, te))
                    else:
                        peak_length = peak_content[-1][1] - peak_content[0][0]
                        # if the peak is too small, reject it
                        if peak_length >= min_length:
                            total_l += peak_length
                            total_p += 1
                        peak_content = [(ts, te),]
                    lastp = te

                if peak_content:
                    peak_length = peak_content[-1][1] - peak_content[0][0]
                    # if the peak is too small, reject it
                    if peak_length >= min_length:
                        total_l += peak_length
                        total_p += 1
                cutoff_lpeaks[n] += total_l
                cutoff_npeaks[n] += total_p

        # prepare the returning text
        ret_list = ["score\tnpeaks\tlpeaks\tavelpeak\n"]
        for n in range(len(cutoff_list)-1, -1, -1):
            cutoff = cutoff_list[n]
            if cutoff_npeaks[n] > 0:
                ret_list.append("%.2f\t%d\t%d\t%.2f\n" % (cutoff,
                                                          cutoff_npeaks[n],
                                                          cutoff_lpeaks[n],
                                                          cutoff_lpeaks[n]/cutoff_npeaks[n]))
        ret = ''.join(ret_list)
        return ret
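
    # Editor's note: illustrative only. The report is a TSV-like string and
    # can be parsed directly, e.g.:
    #
    #     report = bdg.cutoff_analysis(max_gap=50, min_length=200, steps=100)
    #     for line in report.splitlines()[1:]:
    #         score, npeaks, lpeaks, avelpeak = line.split("\t")
    #
    # Column meanings follow the header written above: cutoff score, number
    # of peaks, total peak length, and average peak length.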


@cython.cfunc
def calculate_elbows(values: cnp.ndarray,
                     threshold: cython.float = 0.01) -> cnp.ndarray:
    # this function is supposed to find elbow points for cutoff
    # analysis; however, in reality, it barely works...
    deltas: cnp.ndarray
    slopes: cnp.ndarray
    delta_slopes: cnp.ndarray
    elbows: cnp.ndarray
    avg_delta_slope: cython.float

    # Calculate the difference between each point and the first point
    deltas = values - values[0]
    # Calculate the slope between each point and the last point
    slopes = deltas / (values[-1] - values[0])
    # Calculate the change in slope
    delta_slopes = np.diff(slopes)
    # Calculate the average change in slope
    avg_delta_slope = np.mean(delta_slopes)
    # Find all points where the change in slope is significantly
    # larger than the average
    elbows = np.where(delta_slopes > avg_delta_slope + threshold)[0]
    return elbows
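
# Editor's note: hedged usage sketch for calculate_elbows(). Given the
# cutoff_analysis() report, one could look for an "elbow" in the total peak
# length column (the parsing below is a made-up example, and the comment
# above warns that this heuristic barely works in practice):
#
#     lpeaks = np.array([float(line.split("\t")[2])
#                        for line in report.splitlines()[1:]])
#     candidate_indices = calculate_elbows(lpeaks, threshold=0.01)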