1 # cython: language_level=3
3 # Time-stamp: <2024-10-14 19:32:34 Tao Liu>
5 """Module for BedGraph data class.
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file LICENSE included with
12 # ------------------------------------
14 # ------------------------------------
16 from array
import array
as pyarray
18 # ------------------------------------
20 # ------------------------------------
21 from MACS3
.Signal
.ScoreTrack
import ScoreTrackII
22 from MACS3
.IO
.PeakIO
import PeakIO
, BroadPeakIO
, PeakContent
23 from MACS3
.Signal
.Prob
import chisq_logp_e
25 # ------------------------------------
27 # ------------------------------------
29 from cython
.cimports
.cpython
import bool
31 import cython
.cimports
.numpy
as cnp
33 # ------------------------------------
35 # ------------------------------------
37 from cython
.cimports
.libc
.math
import sqrt
, log10
39 # ------------------------------------
41 # ------------------------------------
# log10(e): scale factor converting natural-log values to log10 (and back).
LOG10_E = 0.43429448190325176
44 # ------------------------------------
46 # ------------------------------------
58 # combine -log10pvalues
59 return chisq_logp_e(2*sum(x
)/LOG10_E
, 2*len(x
), log10
=True)
65 # subtraction of two items list
72 # division of two items list
79 # production of a list of values
80 # only python 3.8 or above
83 # ------------------------------------
85 # ------------------------------------
90 """Class for bedGraph type data.
92 In bedGraph, data are represented as continuous non-overlapping
93 regions in the whole genome. I keep this assumption in all the
94 functions. If data has overlaps, some functions will definitely
95 give incorrect results.
97 1. Continuous: the next region should be after the previous one
98 unless they are on different chromosomes;
100 2. Non-overlapping: the next region should never have overlaps
101 with preceding region.
103 The way to memorize bedGraph data is to remember the transition
104 points together with values of their preceding regions. The last
105 data point may exceed chromosome end, unless a chromosome
106 dictionary is given. Remember the coordinations in bedGraph and
107 this class is 0-indexed and right-open.
111 maxvalue
= cython
.declare(cython
.float, visibility
="public")
112 minvalue
= cython
.declare(cython
.float, visibility
="public")
113 baseline_value
= cython
.declare(cython
.float, visibility
="public")
115 def __init__(self
, baseline_value
: cython
.float = 0):
117 baseline_value is the value to fill in the regions not defined
118 in bedGraph. For example, if the bedGraph is like:
123 Then the region chr1:200..250 should be filled with baseline_value.
127 self
.maxvalue
= -10000000 # initial maximum value is tiny since I want safe_add_loc to update it
128 self
.minvalue
= 10000000 # initial minimum value is large since I want safe_add_loc to update it
129 self
.baseline_value
= baseline_value
132 def add_loc(self
, chromosome
: bytes
,
133 startpos
: cython
.int,
135 value
: cython
.float):
136 """Add a chr-start-end-value block into __data dictionary.
138 Note, we don't check if the add_loc is called continuously on
139 sorted regions without any gap. So we only suggest calling
140 this function within MACS.
145 # basic assumption, end pos should > start pos
152 if chromosome
not in self
.__data
:
153 self
.__data
[chromosome
] = [pyarray('i', []),
155 c
= self
.__data
[chromosome
]
157 # start pos is not 0, then add two blocks, the first
158 # with "baseline_value"; the second with "value"
159 c
[0].append(startpos
)
160 c
[1].append(self
.baseline_value
)
164 c
= self
.__data
[chromosome
]
165 # get the preceding region
168 # if this region is next to the previous one.
170 # if value is the same, simply extend it.
173 # otherwise, add a new region
177 if value
> self
.maxvalue
:
178 self
.maxvalue
= value
179 if value
< self
.minvalue
:
180 self
.minvalue
= value
183 def add_loc_wo_merge(self
, chromosome
: bytes
,
184 startpos
: cython
.int,
186 value
: cython
.float):
187 """Add a chr-start-end-value block into __data dictionary.
189 Note, we don't check if the add_loc is called continuously on
190 sorted regions without any gap. So we only suggest calling
191 this function within MACS.
193 This one won't merge nearby ranges with the same value
200 if value
< self
.baseline_value
:
201 value
= self
.baseline_value
203 if chromosome
not in self
.__data
:
204 self
.__data
[chromosome
] = [pyarray('i', []),
206 c
= self
.__data
[chromosome
]
208 # start pos is not 0, then add two blocks, the first
209 # with "baseline_value"; the second with "value"
210 c
[0].append(startpos
)
211 c
[1].append(self
.baseline_value
)
212 c
= self
.__data
[chromosome
]
215 if value
> self
.maxvalue
:
216 self
.maxvalue
= value
217 if value
< self
.minvalue
:
218 self
.minvalue
= value
221 def add_chrom_data(self
,
225 """Add a pv data to a chromosome. Replace the previous data.
227 p: a pyarray object 'i' for positions
228 v: a pyarray object 'f' for values
230 Note: no checks for error, use with caution
235 self
.__data
[chromosome
] = [p
, v
]
238 if maxv
> self
.maxvalue
:
240 if minv
< self
.minvalue
:
245 def add_chrom_data_PV(self
,
248 """Add a pv data to a chromosome. Replace the previous data.
250 This is a kinda silly function to waste time and convert a PV
251 array (2-d named numpy array) into two python arrays for this
252 BedGraph class. May have better function later.
254 Note: no checks for error, use with caution
259 self
.__data
[chromosome
] = [pyarray('i', pv
['p']),
260 pyarray('f', pv
['v'])]
263 if maxv
> self
.maxvalue
:
265 if minv
< self
.minvalue
:
270 def destroy(self
) -> bool:
271 """ destroy content, free memory.
276 chrs
= self
.get_chr_names()
277 for chrom
in sorted(chrs
):
278 if chrom
in self
.__data
:
279 self
.__data
[chrom
] = [None, None]
280 self
.__data
.pop(chrom
)
284 def get_data_by_chr(self
, chromosome
: bytes
) -> list:
285 """Return array of counts by chromosome.
287 The return value is a tuple:
290 if chromosome
in self
.__data
:
291 return self
.__data
[chromosome
]
296 def get_chr_names(self
) -> set:
297 """Return all the chromosome names stored.
300 return set(sorted(self
.__data
.keys()))
303 def reset_baseline(self
, baseline_value
: cython
.float):
304 """Reset baseline value to baseline_value.
306 So any region between self.baseline_value and baseline_value
307 will be set to baseline_value.
310 self
.baseline_value
= baseline_value
311 self
.filter_score(cutoff
=baseline_value
)
316 def merge_regions(self
):
317 """Merge nearby regions with the same value.
320 # new_pre_pos: cython.int
323 new_pre_value
: cython
.float
328 chrs
= self
.get_chr_names()
329 for chrom
in sorted(chrs
):
330 (p
, v
) = self
.__data
[chrom
]
331 pnext
= iter(p
).__next
__
332 vnext
= iter(v
).__next
__
335 new_pos
= pyarray('L', [pnext(),])
336 new_value
= pyarray('f', [vnext(),])
338 newpa
= new_pos
.append
339 newva
= new_value
.append
341 # new_pre_pos = new_pos[0]
342 new_pre_value
= new_value
[0]
344 for i
in range(1, len(p
)):
347 if value
== new_pre_value
:
354 new_pre_value
= value
355 self
.__data
[chrom
] = [new_pos
, new_value
]
359 def filter_score(self
, cutoff
: cython
.float = 0) -> bool:
360 """Filter using a score cutoff. Any region lower than score
361 cutoff will be set to self.baseline_value.
363 Self will be modified.
365 # new_pre_pos: cython.int
368 new_pre_value
: cython
.float
373 chrs
= self
.get_chr_names()
374 for chrom
in sorted(chrs
):
375 (p
, v
) = self
.__data
[chrom
]
376 pnext
= iter(p
).__next
__
377 vnext
= iter(v
).__next
__
380 new_pos
= pyarray('L', [])
381 new_value
= pyarray('f', [])
385 for i
in range(len(p
)):
390 # this region will be set to baseline_value
391 if new_pre_value
== self
.baseline_value
:
392 # if preceding region is at baseline, extend it
395 # else add a new baseline region
397 new_value
.append(self
.baseline_value
)
399 # put it into new arrays
401 new_value
.append(value
)
402 # new_pre_pos = new_pos[-1]
403 new_pre_value
= new_value
[-1]
404 self
.__data
[chrom
] = [new_pos
, new_value
]
408 def summary(self
) -> tuple:
409 """Calculate the sum, total_length, max, min, mean, and std.
411 Return a tuple for (sum, total_length, max, min, mean, std).
419 variance
: cython
.float
431 for (p
, v
) in self
.__data
.values():
432 # for each chromosome
434 for i
in range(len(p
)):
440 max_v
= max(max(v
), max_v
)
441 min_v
= min(min(v
), min_v
)
444 for (p
, v
) in self
.__data
.values():
445 for i
in range(len(p
)):
449 variance
+= tmp
*tmp
*ln
452 variance
/= float(n_v
-1)
453 std_v
= sqrt(variance
)
454 return (sum_v
, n_v
, max_v
, min_v
, mean_v
, std_v
)
458 cutoff
: cython
.float = 1,
459 min_length
: cython
.int = 200,
460 max_gap
: cython
.int = 50,
461 call_summits
: bool = False):
462 """This function try to find regions within which, scores
463 are continuously higher than a given cutoff.
465 This function is NOT using sliding-windows. Instead, any
466 regions in bedGraph above certain cutoff will be detected,
467 then merged if the gap between nearby two regions are below
468 max_gap. After this, peak is reported if its length is above
471 cutoff: cutoff of value, default 1.
472 min_length : minimum peak length, default 200.
473 gap : maximum gap to merge nearby peaks, default 50.
477 up_limit: the highest acceptable value. Default 10^{310}
478 * so only allow peak with value >=cutoff and <=up_limit
480 This does not work. The region above upper limit may still be
484 # peak_length: cython.int
493 chrs
= self
.get_chr_names()
494 peaks
= PeakIO() # dictionary to save peaks
495 for chrom
in sorted(chrs
):
498 (ps
, vs
) = self
.get_data_by_chr(chrom
) # arrays for position and values
499 psn
= iter(ps
).__next
__ # assign the next function to a viable to speed up
500 vsn
= iter(vs
).__next
__
502 pre_p
= 0 # remember previous position
504 # find the first region above cutoff
505 try: # try to read the first data range for this chrom
510 x
+= 1 # index for the next point
512 peak_content
= [(pre_p
, p
, v
),]
514 break # found the first range above cutoff
518 for i
in range(x
, len(ps
)):
519 # continue scan the rest regions
522 if v
< cutoff
: # not be detected as 'peak'
525 # for points above cutoff
526 # if the gap is allowed
527 if pre_p
- peak_content
[-1][1] <= max_gap
:
528 peak_content
.append((pre_p
, p
, v
))
530 # when the gap is not allowed, close this peak
531 self
.__close
_peak
(peak_content
,
534 chrom
) # , smoothlen=max_gap / 2)
536 peak_content
= [(pre_p
, p
, v
),]
542 self
.__close
_peak
(peak_content
,
545 chrom
) # , smoothlen=max_gap / 2)
549 def __close_peak(self
,
552 min_length
: cython
.int,
553 chrom
: bytes
) -> bool:
554 tsummit
: list # list for temporary summits
555 peak_length
: cython
.int
559 summit_value
: cython
.float
561 peak_length
= peak_content
[-1][1]-peak_content
[0][0]
562 if peak_length
>= min_length
: # if the peak is too small, reject it
566 for (tstart
, tend
, tvalue
) in peak_content
:
567 if not summit_value
or summit_value
< tvalue
:
568 tsummit
= [cython
.cast(cython
.int, (tend
+tstart
)/2),]
569 summit_value
= tvalue
570 elif summit_value
== tvalue
:
571 tsummit
.append(cython
.cast(cython
.int, (tend
+tstart
)/2))
572 summit
= tsummit
[cython
.cast(cython
.int, (len(tsummit
)+1)/2)-1]
577 peak_score
=summit_value
,
586 def call_broadpeaks(self
,
587 lvl1_cutoff
: cython
.float = 500,
588 lvl2_cutoff
: cython
.float = 100,
589 min_length
: cython
.int = 200,
590 lvl1_max_gap
: cython
.int = 50,
591 lvl2_max_gap
: cython
.int = 400):
592 """This function try to find enriched regions within which,
593 scores are continuously higher than a given cutoff for level
594 1, and link them using the gap above level 2 cutoff with a
595 maximum length of lvl2_max_gap.
597 lvl1_cutoff: cutoff of value at enriched regions, default 500.
598 lvl2_cutoff: cutoff of value at linkage regions, default 100.
599 min_length : minimum peak length, default 200.
600 lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50.
601 lvl2_max_gap : maximum length of linkage regions, default 400.
602 colname: can be 'sample','control','-100logp','-100logq'. Cutoff will be applied to the specified column.
604 Return both general PeakIO object for highly enriched regions
605 and gapped broad regions in BroadPeakIO.
612 lvl2
: PeakContent
# PeakContent class object
616 assert lvl1_cutoff
> lvl2_cutoff
, "level 1 cutoff should be larger than level 2."
617 assert lvl1_max_gap
< lvl2_max_gap
, "level 2 maximum gap should be larger than level 1."
618 lvl1_peaks
= self
.call_peaks(cutoff
=lvl1_cutoff
,
619 min_length
=min_length
,
620 max_gap
=lvl1_max_gap
,
622 lvl2_peaks
= self
.call_peaks(cutoff
=lvl2_cutoff
,
623 min_length
=min_length
,
624 max_gap
=lvl2_max_gap
,
626 chrs
= lvl1_peaks
.get_chr_names()
627 broadpeaks
= BroadPeakIO()
628 # use lvl2_peaks as linking regions between lvl1_peaks
629 for chrom
in sorted(chrs
):
630 lvl1peakschrom
= lvl1_peaks
.get_data_from_chrom(chrom
)
631 lvl2peakschrom
= lvl2_peaks
.get_data_from_chrom(chrom
)
632 lvl1peakschrom_next
= iter(lvl1peakschrom
).__next
__
633 tmppeakset
= [] # to temporarily store lvl1 region inside a lvl2 region
634 # our assumption is lvl1 regions should be included in lvl2 regions
636 lvl1
= lvl1peakschrom_next()
637 for i
in range(len(lvl2peakschrom
)):
638 # for each lvl2 peak, find all lvl1 peaks inside
639 lvl2
= lvl2peakschrom
[i
]
641 if lvl2
["start"] <= lvl1
["start"] and lvl1
["end"] <= lvl2
["end"]:
642 tmppeakset
.append(lvl1
)
643 lvl1
= lvl1peakschrom_next()
645 self
.__add
_broadpeak
(broadpeaks
,
651 except StopIteration:
652 self
.__add
_broadpeak
(broadpeaks
, chrom
, lvl2
, tmppeakset
)
654 for j
in range(i
+1, len(lvl2peakschrom
)):
655 self
.__add
_broadpeak
(broadpeaks
,
662 def __add_broadpeak(self
,
665 lvl2peak
: PeakContent
,
667 """Internal function to create broad peak.
677 start
= lvl2peak
["start"]
678 end
= lvl2peak
["end"]
680 # the following code will add those broad/lvl2 peaks with no
681 # strong/lvl1 peaks inside
684 # will complement by adding 1bps start and end to this region
685 # may change in the future if gappedPeak format was improved.
686 bpeaks
.add(chrom
, start
, end
,
687 score
=lvl2peak
["score"],
688 thickStart
=(b
"%d" % start
),
689 thickEnd
=(b
"%d" % end
),
692 blockStarts
=(b
"0,%d" % (end
-start
-1)),
693 pileup
=lvl2peak
["pileup"],
694 pscore
=lvl2peak
["pscore"],
695 fold_change
=lvl2peak
["fc"],
696 qscore
=lvl2peak
["qscore"])
699 thickStart
= b
"%d" % lvl1peakset
[0]["start"]
700 thickEnd
= b
"%d" % lvl1peakset
[-1]["end"]
701 blockNum
= len(lvl1peakset
)
702 blockSizes
= b
",".join([b
"%d" % x
["length"] for x
in lvl1peakset
])
703 blockStarts
= b
",".join([b
"%d" % (x
["start"]-start
) for x
in lvl1peakset
])
705 if int(thickStart
) != start
:
707 thickStart
= b
"%d" % start
709 blockSizes
= b
"1,"+blockSizes
710 blockStarts
= b
"0,"+blockStarts
711 if int(thickEnd
) != end
:
712 # add 1bp right block
713 thickEnd
= b
"%d" % end
715 blockSizes
= blockSizes
+b
",1"
716 blockStarts
= blockStarts
+ b
"," + (b
"%d" % (end
-start
-1))
718 bpeaks
.add(chrom
, start
, end
,
719 score
=lvl2peak
["score"],
720 thickStart
=thickStart
,
723 blockSizes
=blockSizes
,
724 blockStarts
=blockStarts
,
725 pileup
=lvl2peak
["pileup"],
726 pscore
=lvl2peak
["pscore"],
727 fold_change
=lvl2peak
["fc"],
728 qscore
=lvl2peak
["qscore"])
732 def refine_peaks(self
, peaks
):
733 """This function try to based on given peaks, re-evaluate the
734 peak region, call the summit.
737 return: a new PeakIO object
750 chrs
= self
.get_chr_names()
751 assert isinstance(peaks
, PeakIO
)
752 chrs
= chrs
.intersection(set(peaks
.get_chr_names()))
754 for chrom
in sorted(chrs
):
755 peaks_chr
= peaks
.get_data_from_chrom(chrom
)
757 # arrays for position and values
758 (ps
, vs
) = self
.get_data_by_chr(chrom
)
759 # assign the next function to a viable to speed up
760 psn
= iter(ps
).__next
__
761 vsn
= iter(vs
).__next
__
762 peakn
= iter(peaks_chr
).__next
__
764 # remember previous position in bedgraph/self
769 peak_s
= peak
["start"]
773 if p
> peak_s
and peak_e
> pre_p
:
774 # now put four coordinates together and pick the middle two
775 s
, e
= sorted([p
, peak_s
, peak_e
, pre_p
])[1:3]
777 peak_content
.append((s
, e
, v
))
784 # no more value chunk in bedGraph
786 elif pre_p
>= peak_e
:
788 self
.__close
_peak
(peak_content
, new_peaks
, 0, chrom
)
793 peak_s
= peak
["start"]
805 # no more value chunk in bedGraph
808 raise Exception(f
"no way here! prev position:{pre_p}; position:{p}; value:{v}; peak start:{peak_s}; peak end:{peak_e}")
812 self
.__close
_peak
(peak_content
, new_peaks
, 0, chrom
)
816 def total(self
) -> cython
.int:
817 """Return the number of regions in this object.
822 for (p
, v
) in self
.__data
.values():
827 def set_single_value(self
, new_value
: cython
.float):
828 """Change all the values in bedGraph to the same new_value,
829 return a new bedGraphTrackI.
835 ret
= bedGraphTrackI()
836 chroms
= set(self
.get_chr_names())
837 for chrom
in sorted(chroms
):
838 # arrays for position and values
839 (p1
, v1
) = self
.get_data_by_chr(chrom
)
842 # add a region from 0 to max_p
843 ret
.add_loc(chrom
, 0, max_p
, new_value
)
847 def overlie(self
, bdgTracks
, func
: str = "max"):
848 """Calculate two or more bedGraphTrackI objects by letting self
849 overlying bdgTrack2, with user-defined functions.
851 Transition positions from both bedGraphTrackI objects will be
852 considered and combined. For example:
854 #1 bedGraph (self) | #2 bedGraph
855 -----------------------------------------------
856 chr1 0 100 0 | chr1 0 150 1
857 chr1 100 200 3 | chr1 150 250 2
858 chr1 200 300 4 | chr1 250 300 4
860 these two bedGraphs will be combined to have five transition
861 points: 100, 150, 200, 250, and 300. So in order to calculate
862 two bedGraphs, I pair values within the following regions
865 chr s e (#1,#2) applied_func_max
866 -----------------------------------------------
873 Then the given 'func' will be applied on each 2-tuple as func(#1,#2)
875 Supported 'func' are "sum", "subtract" (only for two bdg
876 objects), "product", "divide" (only for two bdg objects),
877 "max", "mean" and "fisher".
879 Return value is a new bedGraphTrackI object.
881 Option: bdgTracks can be a list of bedGraphTrackI objects
886 nr_tracks
= len(bdgTracks
) + 1 # +1 for self
887 assert nr_tracks
>= 2, "Specify at least one more bdg objects."
888 for i
, bdgTrack
in enumerate(bdgTracks
):
889 assert isinstance(bdgTrack
, bedGraphTrackI
), "bdgTrack{} is not a bedGraphTrackI object".format(i
+ 1)
895 elif func
== "fisher":
899 elif func
== "product":
901 elif func
== "subtract":
905 raise Exception(f
"Only one more bdg object is allowed, but provided {nr_tracks-1}")
906 elif func
== "divide":
910 raise Exception(f
"Only one more bdg object is allowed, but provided {nr_tracks-1}")
912 raise Exception("Invalid function {func}! Choose from 'sum', 'subtract' (only for two bdg objects), 'product', 'divide' (only for two bdg objects), 'max', 'mean' and 'fisher'. ")
914 ret
= bedGraphTrackI()
916 common_chr
= set(self
.get_chr_names())
917 for track
in bdgTracks
:
918 common_chr
= common_chr
.intersection(set(track
.get_chr_names()))
920 for chrom
in sorted(common_chr
):
921 datas
= [self
.get_data_by_chr(chrom
)]
922 datas
.extend([bdgTracks
[i
].get_data_by_chr(chrom
) for i
in range(len(bdgTracks
))])
924 ps
, vs
, pn
, vn
= [], [], [], []
927 pn
.append(iter(ps
[-1]).__next
__)
929 vn
.append(iter(vs
[-1]).__next
__)
931 pre_p
= 0 # remember the previous position in the new bedGraphTrackI object ret
933 ps_cur
= [pn
[i
]() for i
in range(len(pn
))]
934 vs_cur
= [vn
[i
]() for i
in range(len(pn
))]
937 # get the lowest position
938 lowest_p
= min(ps_cur
)
940 # at least one lowest position, could be multiple
941 locations
= [i
for i
in range(len(ps_cur
)) if ps_cur
[i
] == lowest_p
]
943 # add the data until the interval
944 ret
.add_loc(chrom
, pre_p
, ps_cur
[locations
[0]], f(vs_cur
))
946 pre_p
= ps_cur
[locations
[0]]
947 for index
in locations
:
948 ps_cur
[index
] = pn
[index
]()
949 vs_cur
[index
] = vn
[index
]()
950 except StopIteration:
951 # meet the end of either bedGraphTrackI, simply exit
956 def apply_func(self
, func
) -> bool:
957 """Apply function 'func' to every value in this bedGraphTrackI object.
959 *Two adjacent regions with same value after applying func will
964 for (p
, s
) in self
.__data
.values():
965 for i
in range(len(s
)):
967 self
.maxvalue
= func(self
.maxvalue
)
968 self
.minvalue
= func(self
.minvalue
)
973 """Convert pvalue scores to qvalue scores.
975 *Assume scores in this bedGraph are pvalue scores! Not work
976 for other type of scores.
980 pscore_array
: pyarray
981 pvalue_stat
: dict = {}
988 nhcal
: cython
.long = 0
993 # pre_v: cython.float
1000 # calculate frequencies of each p-score
1001 for chrom
in sorted(self
.get_chr_names()):
1004 [pos_array
, pscore_array
] = self
.__data
[chrom
]
1006 pn
= iter(pos_array
).__next
__
1007 vn
= iter(pscore_array
).__next
__
1009 for i
in range(len(pos_array
)):
1012 this_l
= this_p
- pre_p
1013 if this_v
in pvalue_stat
:
1014 pvalue_stat
[this_v
] += this_l
1016 pvalue_stat
[this_v
] = this_l
1019 # nhcal += len(pos_array)
1023 N
= sum(pvalue_stat
.values()) # total length
1026 # pre_v = -2147483647
1028 pre_q
= 2147483647 # save the previous q-value
1030 # calculate qscore for each pscore
1032 unique_values
= sorted(pvalue_stat
.keys(), reverse
=True)
1033 for i
in range(len(unique_values
)):
1034 v
= unique_values
[i
]
1035 # l = pvalue_stat[v]
1036 q
= v
+ (log10(k
) + f
)
1037 q
= max(0, min(pre_q
, q
)) # make q-score monotonic
1044 # convert pscore to qscore
1045 for chrom
in sorted(self
.get_chr_names()):
1046 [pos_array
, pscore_array
] = self
.__data
[chrom
]
1048 for i
in range(len(pos_array
)):
1049 pscore_array
[i
] = pqtable
[pscore_array
[i
]]
1051 self
.merge_regions()
1055 def extract_value(self
, bdgTrack2
):
1056 """Extract values from regions defined in bedGraphTrackI class object
1068 assert isinstance(bdgTrack2
, bedGraphTrackI
), "not a bedGraphTrackI object"
1070 # 1: region in bdgTrack2; 2: value; 3: length with the value
1071 ret
= [[], pyarray('f', []), pyarray('L', [])]
1072 radd
= ret
[0].append
1073 vadd
= ret
[1].append
1074 ladd
= ret
[2].append
1076 chr1
= set(self
.get_chr_names())
1077 chr2
= set(bdgTrack2
.get_chr_names())
1078 common_chr
= chr1
.intersection(chr2
)
1079 for i
in range(len(common_chr
)):
1080 chrom
= common_chr
.pop()
1081 (p1s
, v1s
) = self
.get_data_by_chr(chrom
) # arrays for position and values
1082 # assign the next function to a viable to speed up
1083 p1n
= iter(p1s
).__next
__
1084 v1n
= iter(v1s
).__next
__
1086 # arrays for position and values
1087 (p2s
, v2s
) = bdgTrack2
.get_data_by_chr(chrom
)
1088 # assign the next function to a viable to speed up
1089 p2n
= iter(p2s
).__next
__
1090 v2n
= iter(v2s
).__next
__
1091 # remember the previous position in the new bedGraphTrackI
1103 # clip a region from pre_p to p1, then set pre_p as p1.
1105 radd(str(chrom
)+"."+str(pre_p
)+"."+str(p1
))
1109 # call for the next p1 and v1
1113 # clip a region from pre_p to p2, then set
1116 radd(str(chrom
)+"."+str(pre_p
)+"."+str(p2
))
1120 # call for the next p2 and v2
1124 # from pre_p to p1 or p2, then set pre_p as p1 or p2.
1126 radd(str(chrom
)+"."+str(pre_p
)+"."+str(p1
))
1130 # call for the next p1, v1, p2, v2.
1135 except StopIteration:
1136 # meet the end of either bedGraphTrackI, simply exit
1142 def extract_value_hmmr(self
, bdgTrack2
):
1143 """Extract values from regions defined in bedGraphTrackI class object
1146 I will try to tweak this function to output only the values of
1147 bdgTrack1 (self) in the regions in bdgTrack2
1149 This is specifically for HMMRATAC. bdgTrack2 should be a
1150 bedgraph object containing the bins with value set to
1151 'mark_bin' -- the bins in the same region will have the same
1163 assert isinstance(bdgTrack2
, bedGraphTrackI
), "not a bedGraphTrackI object"
1165 # 0: bin location (chrom, position); 1: value; 2: number of bins in this region
1166 ret
= [[], pyarray('f', []), pyarray('i', [])]
1167 padd
= ret
[0].append
1168 vadd
= ret
[1].append
1169 ladd
= ret
[2].append
1171 chr1
= set(self
.get_chr_names())
1172 chr2
= set(bdgTrack2
.get_chr_names())
1173 common_chr
= sorted(list(chr1
.intersection(chr2
)))
1174 for i
in range(len(common_chr
)):
1175 chrom
= common_chr
.pop()
1176 # arrays for position and values
1177 (p1s
, v1s
) = self
.get_data_by_chr(chrom
)
1178 # assign the next function to a viable to speed up
1179 p1n
= iter(p1s
).__next
__
1180 v1n
= iter(v1s
).__next
__
1182 # arrays for position and values
1183 (p2s
, v2s
) = bdgTrack2
.get_data_by_chr(chrom
)
1184 # assign the next function to a viable to speed up
1185 p2n
= iter(p2s
).__next
__
1186 v2n
= iter(v2s
).__next
__
1187 # remember the previous position in the new bedGraphTrackI
1199 # clip a region from pre_p to p1, then set pre_p as p1.
1200 # in this case, we don't output any
1202 # radd(str(chrom)+"."+str(pre_p)+"."+str(p1))
1206 # call for the next p1 and v1
1210 # clip a region from pre_p to p2, then set pre_p as p2.
1211 if v2
!= 0: # 0 means it's a gap region, we should have value > 1
1216 # call for the next p2 and v2
1220 # from pre_p to p1 or p2, then set pre_p as p1 or p2.
1221 if v2
!= 0: # 0 means it's a gap region, we should have 1 or -1
1226 # call for the next p1, v1, p2, v2.
1231 except StopIteration:
1232 # meet the end of either bedGraphTrackI, simply exit
1238 def make_ScoreTrackII_for_macs(self
, bdgTrack2
,
1239 depth1
: float = 1.0,
1240 depth2
: float = 1.0):
1241 """A modified overlie function for MACS v2.
1243 effective_depth_in_million: sequencing depth in million after
1244 duplicates being filtered. If
1245 treatment is scaled down to
1246 control sample size, then this
1247 should be control sample size in
1248 million. And vice versa.
1250 Return value is a ScoreTrackII object.
1259 assert isinstance(bdgTrack2
, bedGraphTrackI
), "bdgTrack2 is not a bedGraphTrackI object"
1261 ret
= ScoreTrackII(treat_depth
=depth1
,
1265 chr1
= set(self
.get_chr_names())
1266 chr2
= set(bdgTrack2
.get_chr_names())
1267 common_chr
= chr1
.intersection(chr2
)
1268 for chrom
in sorted(common_chr
):
1269 # arrays for position and values
1270 (p1s
, v1s
) = self
.get_data_by_chr(chrom
)
1271 # assign the next function to a viable to speed up
1272 p1n
= iter(p1s
).__next
__
1273 v1n
= iter(v1s
).__next
__
1274 # arrays for position and values
1275 (p2s
, v2s
) = bdgTrack2
.get_data_by_chr(chrom
)
1276 # assign the next function to a viable to speed up
1277 p2n
= iter(p2s
).__next
__
1278 v2n
= iter(v2s
).__next
__
1280 # this is the maximum number of locations needed to be
1281 # recorded in scoreTrackI for this chromosome.
1282 chrom_max_len
= len(p1s
)+len(p2s
)
1284 ret
.add_chromosome(chrom
, chrom_max_len
)
1286 # remember the previous position in the new bedGraphTrackI
1299 # clip a region from pre_p to p1, then set pre_p as p1.
1300 retadd(chrom
, p1
, v1
, v2
)
1302 # call for the next p1 and v1
1306 # clip a region from pre_p to p2, then set pre_p as p2.
1307 retadd(chrom
, p2
, v1
, v2
)
1309 # call for the next p2 and v2
1313 # from pre_p to p1 or p2, then set pre_p as p1 or p2.
1314 retadd(chrom
, p1
, v1
, v2
)
1316 # call for the next p1, v1, p2, v2.
1321 except StopIteration:
1322 # meet the end of either bedGraphTrackI, simply exit
1326 # ret.merge_regions()
1330 def cutoff_analysis(self
,
1331 max_gap
: cython
.int,
1332 min_length
: cython
.int,
1333 steps
: cython
.int = 100,
1334 min_score
: cython
.float = 0,
1335 max_score
: cython
.float = 1000) -> str:
1337 Cutoff analysis function for bedGraphTrackI object.
1339 This function will try all possible cutoff values on the score
1340 column to call peaks. Then will give a report of a number of
1341 metrics (number of peaks, total length of peaks, average
1342 length of peak) at varying score cutoffs. For each score
1343 cutoff, the function finds the positions where the score
1344 exceeds the cutoff, then groups those positions into "peaks"
1345 based on the maximum allowed gap (max_gap) between consecutive
1346 positions. If a peak's length exceeds the minimum length
1347 (min_length), the peak is counted.
1353 Maximum allowed gap between consecutive positions above cutoff
1355 min_length : int32_t Minimum length of peak
1357 It will be used to calculate 'step' to increase from min_v to
1360 min_score: float32_t
1361 Minimum score for cutoff analysis. Note1: we will take the
1362 larger value between the actual minimum value in the BedGraph
1363 and min_score as min_v. Note2: the min_v won't be included in
1364 the final result. We will try to output the smallest cutoff as
1367 max_score: float32_t
1368 Maximum score for cutoff analysis. Note1: we will take the
1369 smaller value between the actual maximum value in the BedGraph
1370 and max_score as max_v. Note2: the max_v may not be included
1371 in the final result. We will only output the cutoff that can
1372 generate at least 1 peak.
1377 Cutoff analysis report in str object.
1382 May need to separate this function out as a class so that we
1383 can add more ways to analyze the result. Also, we can let this
1384 function return a list of dictionary or data.frame in that
1385 way, instead of str object.
1395 cutoff
: cython
.float
1396 total_l
: cython
.long
1397 total_p
: cython
.long
1404 peak_length
: cython
.long
1405 # dict cutoff_npeaks, cutoff_lpeaks
1408 chrs
= self
.get_chr_names()
1410 # midvalue = self.minvalue/2 + self.maxvalue/2
1411 # s = float(self.minvalue - midvalue)/steps
1412 minv
= max(min_score
, self
.minvalue
)
1413 maxv
= min(self
.maxvalue
, max_score
)
1415 s
= float(maxv
- minv
)/steps
1417 # a list of possible cutoff values from minv to maxv with step of s
1418 cutoff_list
= [round(value
, 3) for value
in np
.arange(minv
, maxv
, s
)]
1420 cutoff_npeaks
= [0] * len(cutoff_list
)
1421 cutoff_lpeaks
= [0] * len(cutoff_list
)
1423 for chrom
in sorted(chrs
):
1424 (pos_array
, score_array
) = self
.__data
[chrom
]
1425 pos_array
= np
.array(self
.__data
[chrom
][0])
1426 score_array
= np
.array(self
.__data
[chrom
][1])
1428 for n
in range(len(cutoff_list
)):
1429 cutoff
= cutoff_list
[n
]
1430 total_l
= 0 # total length of peaks
1431 total_p
= 0 # total number of peaks
1433 # get the regions with scores above cutoffs. This is
1434 # not an optimized method. It would be better to store
1435 # score array in a 2-D ndarray?
1436 above_cutoff
= np
.nonzero(score_array
> cutoff
)[0]
1437 # end positions of regions where score is above cutoff
1438 above_cutoff_endpos
= pos_array
[above_cutoff
]
1439 # start positions of regions where score is above cutoff
1440 above_cutoff_startpos
= pos_array
[above_cutoff
-1]
1442 if above_cutoff_endpos
.size
== 0:
1445 # first bit of region above cutoff
1446 acs_next
= iter(above_cutoff_startpos
).__next
__
1447 ace_next
= iter(above_cutoff_endpos
).__next
__
1451 peak_content
= [(ts
, te
),]
1454 for i
in range(1, above_cutoff_startpos
.size
):
1459 peak_content
.append((ts
, te
))
1461 peak_length
= peak_content
[-1][1] - peak_content
[0][0]
1462 # if the peak is too small, reject it
1463 if peak_length
>= min_length
:
1464 total_l
+= peak_length
1466 peak_content
= [(ts
, te
),]
1470 peak_length
= peak_content
[-1][1] - peak_content
[0][0]
1471 # if the peak is too small, reject it
1472 if peak_length
>= min_length
:
1473 total_l
+= peak_length
1475 cutoff_lpeaks
[n
] += total_l
1476 cutoff_npeaks
[n
] += total_p
1478 # prepare the returnning text
1479 ret_list
= ["score\tnpeaks\tlpeaks\tavelpeak\n"]
1480 for n
in range(len(cutoff_list
)-1, -1, -1):
1481 cutoff
= cutoff_list
[n
]
1482 if cutoff_npeaks
[n
] > 0:
1483 ret_list
.append("%.2f\t%d\t%d\t%.2f\n" % (cutoff
,
1486 cutoff_lpeaks
[n
]/cutoff_npeaks
[n
]))
1487 ret
= ''.join(ret_list
)
1492 def calculate_elbows(values
: cnp
.ndarray
,
1493 threshold
: cython
.float = 0.01) -> cnp
.ndarray
:
1494 # although this function is supposed to find elbow pts for cutoff
1495 # analysis, however, in reality, it barely works...
1498 delta_slopes
: cnp
.ndarray
1500 avg_delta_slope
: cython
.float
1502 # Calculate the difference between each point and the first point
1503 deltas
= values
- values
[0]
1504 # Calculate the slope between each point and the last point
1505 slopes
= deltas
/ (values
[-1] - values
[0])
1506 # Calculate the change in slope
1507 delta_slopes
= np
.diff(slopes
)
1508 # Calculate the average change in slope
1509 avg_delta_slope
= np
.mean(delta_slopes
)
1510 # Find all points where the change in slope is significantly
1511 # larger than the average
1512 elbows
= np
.where(delta_slopes
> avg_delta_slope
+ threshold
)[0]