# cython: language_level=3
# Time-stamp: <2024-10-10 16:45:13 Tao Liu>

"""Module for Feature IO classes.

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""
# ------------------------------------
# python modules
# ------------------------------------
from functools import reduce
# ------------------------------------
# MACS3 modules
# ------------------------------------
from MACS3.Signal.SignalProcessing import maxima, enforce_valleys, enforce_peakyness
from MACS3.Signal.Prob import poisson_cdf
from MACS3.IO.PeakIO import PeakIO, BroadPeakIO
# ------------------------------------
# Other modules
# ------------------------------------
import numpy as np
cimport numpy as np
from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t
from cpython cimport bool
from cykhash import PyObjectMap, Float32to32Map
# ------------------------------------
# C lib
# ------------------------------------
from libc.math cimport log10, log, floor, ceil
# ------------------------------------
# constants
# ------------------------------------
__version__ = "scoreTrack $Revision$"
__author__ = "Tao Liu <vladimir.liu@gmail.com>"
__doc__ = "scoreTrack classes"
# ------------------------------------
# Misc functions
# ------------------------------------
cdef inline int32_t int_max(int32_t a, int32_t b): return a if a >= b else b
cdef inline int32_t int_min(int32_t a, int32_t b): return a if a <= b else b

LOG10_E = 0.43429448190325176

pscore_dict = PyObjectMap()

cdef float32_t get_pscore ( int32_t observed, float32_t expectation ):
    """Get p-value score from Poisson test. First check the existing
    table; if that fails, call the poisson_cdf function, then store the
    result in the table.
    """
    cdef:
        float32_t score

    try:
        return pscore_dict[(observed, expectation)]
    except KeyError:
        score = -1*poisson_cdf(observed, expectation, False, True)
        pscore_dict[(observed, expectation)] = score
        return score
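# Illustrative sketch (comment only, not part of the module): the score
# cached above is the -log10 of the upper-tail Poisson probability of
# seeing `observed` reads given `expectation`. A plain Python
# approximation, assuming scipy is available (an assumption, not a
# MACS3 dependency; the exact tail convention follows MACS3's own
# poisson_cdf), would look like:
#
#     from math import log10
#     from scipy.stats import poisson
#
#     def pscore(observed: int, expectation: float) -> float:
#         # sf(k) is P(X > k), so P(X >= observed) = sf(observed - 1)
#         return -log10(poisson.sf(observed - 1, expectation))
#
# Memoizing in pscore_dict pays off because pileup/control pairs repeat
# heavily across a genome, so most calls are dictionary hits.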

asym_logLR_dict = PyObjectMap()

cdef float32_t logLR_asym ( float32_t x, float32_t y ):
    """Calculate log10 likelihood ratio between H1 ( enriched ) and H0 (
    chromatin bias ). Set minus sign for depletion.

    *asymmetric version*
    """
    cdef:
        float32_t s

    if (x,y) in asym_logLR_dict:
        return asym_logLR_dict[ ( x, y ) ]
    else:
        if x > y:
            s = (x*(log(x)-log(y))+y-x)*LOG10_E
        elif x < y:
            s = (x*(-log(x)+log(y))-y+x)*LOG10_E
        else:
            s = 0
        asym_logLR_dict[ ( x, y ) ] = s
        return s
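# Worked example (comment only): with x = 10 reads in treatment and
# y = 5 expected from control, x > y, so
#     s = (10*(ln 10 - ln 5) + 5 - 10) * log10(e)
#       = (6.93 - 5) * 0.4343 ~ 0.84
# i.e. roughly a 10^0.84 ~ 7-fold likelihood preference for the
# "enriched" model; x < y yields a negative score (depletion).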

sym_logLR_dict = PyObjectMap()

cdef float32_t logLR_sym ( float32_t x, float32_t y ):
    """Calculate log10 likelihood ratio between H1 ( enriched ) and H0 (
    another enriched ). Set minus sign for H0>H1.

    * symmetric version *
    """
    cdef:
        float32_t s

    if (x,y) in sym_logLR_dict:
        return sym_logLR_dict[ ( x, y ) ]
    else:
        if x > y:
            s = (x*(log(x)-log(y))+y-x)*LOG10_E
        elif y > x:
            s = (y*(log(x)-log(y))+y-x)*LOG10_E
        else:
            s = 0
        sym_logLR_dict[ ( x, y ) ] = s
        return s
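# Note (comment only): the symmetric version differs from logLR_asym in
# which value multiplies the log term when y > x, making the score
# antisymmetric: logLR_sym(x, y) == -logLR_sym(y, x). Continuing the
# example above, logLR_sym(10, 5) ~ +0.84 while logLR_sym(5, 10) ~ -0.84,
# which is what TwoConditionScores relies on when comparing two ChIPs.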

cdef float32_t get_logFE ( float32_t x, float32_t y ):
    """ return log10 fold enrichment; the caller is expected to add the
    pseudocount to both x and y.
    """
    return log10( x/y )

cdef float32_t get_subtraction ( float32_t x, float32_t y):
    """ return subtraction.
    """
    return x - y

# ------------------------------------
# Classes
# ------------------------------------
cdef class ScoreTrackII:
    """Class for a container to keep signals of each genomic position,
    including 1. score, 2. treatment pileup, and 3. control pileup.

    It also contains scoring methods and call_peak functions.
    """
    cdef:
        dict data                  # dictionary for data of each chromosome
        dict datalength            # length of data array of each chromosome
        bool trackline             # whether trackline should be saved in bedGraph
        float32_t treat_edm        # seq depth in million of treatment
        float32_t ctrl_edm         # seq depth in million of control
        char scoring_method        # method for calculating scores.
        char normalization_method  # scale to control? scale to treatment? both scale to 1 million reads?
        float32_t pseudocount      # the pseudocount used to calculate logLR, FE or logFE
        float32_t cutoff           # cutoff used by the peak callers
        dict pvalue_stat           # save pvalue<->length dictionary

    def __init__ (self, float32_t treat_depth, float32_t ctrl_depth, float32_t pseudocount = 1.0 ):
        """Initialize.

        treat_depth and ctrl_depth are effective depths in million:
                                    sequencing depth in million after
                                    duplicates being filtered. If
                                    treatment is scaled down to
                                    control sample size, then this
                                    should be control sample size in
                                    million. And vice versa.

        pseudocount: a pseudocount used to calculate logLR, FE or
                     logFE. Please note this value will not be changed
                     with normalization method. So if you really want
                     to set pseudocount 1 per million reads, set it
                     after you normalize treat and control by million
                     reads by `change_normalization_method(ord('M'))`.
        """
        self.data = {}           # for each chromosome, there is a l*4
                                 # matrix. First column: end position
                                 # of a region; second: treatment
                                 # pileup; third: control pileup;
                                 # fourth: score ( can be
                                 # p/q-value/likelihood
                                 # ratio/fold-enrichment/subtraction
                                 # depending on -c setting)
        self.datalength = {}
        self.trackline = False
        self.treat_edm = treat_depth
        self.ctrl_edm = ctrl_depth
        #scoring_method:  p: -log10 pvalue;
        #                 q: -log10 qvalue;
        #                 l: log10 likelihood ratio ( minus for depletion )
        #                 f: log10 fold enrichment
        #                 F: linear fold enrichment
        #                 d: subtraction
        #                 m: fragment pileup per million reads
        #                 N: not set yet
        self.scoring_method = ord("N")
        #normalization_method: T: scale to depth of treatment;
        #                      C: scale to depth of control;
        #                      M: scale to depth of 1 million;
        #                      N: not set/ raw pileup
        self.normalization_method = ord("N")
        self.pseudocount = pseudocount
        self.pvalue_stat = {}

    cpdef set_pseudocount( self, float32_t pseudocount ):
        self.pseudocount = pseudocount

    cpdef enable_trackline( self ):
        """Turn on trackline with bedgraph output
        """
        self.trackline = True

    cpdef add_chromosome ( self, bytes chrom, int32_t chrom_max_len ):
        """
        chrom: chromosome name
        chrom_max_len: maximum number of data points in this chromosome
        """
        if chrom not in self.data:
            #self.data[chrom] = np.zeros( ( chrom_max_len, 4 ), dtype="int32" ) # remember col #2-4 is actual value * 100, I use integer here.
            self.data[chrom] = [ np.zeros( chrom_max_len, dtype="int32" ),   # pos
                                 np.zeros( chrom_max_len, dtype="float32" ), # pileup at each interval, in float32 format
                                 np.zeros( chrom_max_len, dtype="float32" ), # control at each interval, in float32 format
                                 np.zeros( chrom_max_len, dtype="float32" ) ] # score at each interval, in float32 format
            self.datalength[chrom] = 0

    cpdef add (self, bytes chromosome, int32_t endpos, float32_t chip, float32_t control):
        """Add a chr-endpos-sample-control block into data
        dictionary.

        chromosome: chromosome name in string
        endpos    : end position of each interval in integer
        chip      : ChIP pileup value of each interval in float
        control   : Control pileup value of each interval in float

        *Warning* Need to add regions continuously.
        """
        cdef:
            int32_t i

        i = self.datalength[chromosome]
        c = self.data[chromosome]
        c[0][ i ] = endpos
        c[1][ i ] = chip
        c[2][ i ] = control
        self.datalength[chromosome] += 1
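    # Illustrative usage sketch (comment only; the interval values are
    # made up): build a two-interval chromosome, then finalize to trim
    # the preallocated arrays.
    #
    #     st = ScoreTrackII(treat_depth=10.0, ctrl_depth=10.0)
    #     st.add_chromosome(b"chr1", 100)
    #     st.add(b"chr1", 120, 8.0, 2.5)   # covers [0, 120)
    #     st.add(b"chr1", 250, 3.0, 2.5)   # covers [120, 250)
    #     st.finalize()
    #
    # Intervals must be appended left to right, since each `endpos`
    # implicitly starts where the previous interval ended.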

    cpdef finalize ( self ):
        """
        Adjust array size of each chromosome.
        """
        cdef:
            bytes chrom
            int32_t l

        for chrom in sorted(self.data.keys()):
            d = self.data[chrom]
            l = self.datalength[chrom]
            d[0].resize( l, refcheck = False )
            d[1].resize( l, refcheck = False )
            d[2].resize( l, refcheck = False )
            d[3].resize( l, refcheck = False )
        return

    cpdef get_data_by_chr (self, bytes chromosome):
        """Return array of counts by chromosome.

        The return value is the [end positions, treatment pileup,
        control pileup, score] list for that chromosome.
        """
        if chromosome in self.data:
            return self.data[chromosome]
        else:
            return None

    cpdef get_chr_names (self):
        """Return all the chromosome names stored.
        """
        l = set(self.data.keys())
        return l

    cpdef change_normalization_method ( self, char normalization_method ):
        """Change/set normalization method. However, I do not
        recommend changing this back and forth, since some precision
        issues will happen -- I only keep two digits.

        normalization_method: T: scale to depth of treatment;
                              C: scale to depth of control;
                              M: scale to depth of 1 million;
                              N: not set/ raw pileup
        """
        if normalization_method == ord('T'):
            if self.normalization_method == ord('T'): # do nothing
                pass
            elif self.normalization_method == ord('C'):
                self.normalize( self.treat_edm/self.ctrl_edm, self.treat_edm/self.ctrl_edm )
            elif self.normalization_method == ord('M'):
                self.normalize( self.treat_edm, self.treat_edm )
            elif self.normalization_method == ord('N'):
                self.normalize( 1, self.treat_edm/self.ctrl_edm )
            else:
                raise NotImplementedError
            self.normalization_method = ord('T')
        elif normalization_method == ord('C'):
            if self.normalization_method == ord('T'):
                self.normalize( self.ctrl_edm/self.treat_edm, self.ctrl_edm/self.treat_edm )
            elif self.normalization_method == ord('C'): # do nothing
                pass
            elif self.normalization_method == ord('M'):
                self.normalize( self.ctrl_edm, self.ctrl_edm )
            elif self.normalization_method == ord('N'):
                self.normalize( self.ctrl_edm/self.treat_edm, 1 )
            else:
                raise NotImplementedError
            self.normalization_method = ord('C')
        elif normalization_method == ord('M'):
            if self.normalization_method == ord('T'):
                self.normalize( 1/self.treat_edm, 1/self.treat_edm )
            elif self.normalization_method == ord('C'):
                self.normalize( 1/self.ctrl_edm, 1/self.ctrl_edm )
            elif self.normalization_method == ord('M'): # do nothing
                pass
            elif self.normalization_method == ord('N'):
                self.normalize( 1/self.treat_edm, 1/self.ctrl_edm )
            else:
                raise NotImplementedError
            self.normalization_method = ord('M')
        elif normalization_method == ord('N'):
            if self.normalization_method == ord('T'):
                self.normalize( self.treat_edm, self.treat_edm )
            elif self.normalization_method == ord('C'):
                self.normalize( self.ctrl_edm, self.ctrl_edm )
            elif self.normalization_method == ord('M'):
                self.normalize( self.treat_edm, self.ctrl_edm )
            elif self.normalization_method == ord('N'): # do nothing
                pass
            else:
                raise NotImplementedError
            self.normalization_method = ord('N')
        else:
            raise NotImplementedError

    cdef normalize ( self, float32_t treat_scale, float32_t control_scale ):
        cdef:
            np.ndarray p, c
            int64_t l, i

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            l = self.datalength[chrom]
            for i in range(l):
                p[ i ] *= treat_scale
                c[ i ] *= control_scale
        return

    cpdef change_score_method (self, char scoring_method):
        """
        scoring_method:  p: -log10 pvalue;
                         q: -log10 qvalue;
                         l: log10 likelihood ratio ( minus for depletion )
                         s: symmetric log10 likelihood ratio ( for comparing two ChIPs )
                         f: log10 fold enrichment
                         F: linear fold enrichment
                         d: subtraction
                         M: maximum of treatment and control pileup
                         m: fragment pileup per million reads
        """
        if scoring_method == ord('p'):
            self.compute_pvalue()
        elif scoring_method == ord('q'):
            #if not already calculated p, compute pvalue first
            if self.scoring_method != ord('p'):
                self.compute_pvalue()
            self.compute_qvalue()
        elif scoring_method == ord('l'):
            self.compute_likelihood()
        elif scoring_method == ord('s'):
            self.compute_sym_likelihood()
        elif scoring_method == ord('f'):
            self.compute_logFE()
        elif scoring_method == ord('F'):
            self.compute_foldenrichment()
        elif scoring_method == ord('d'):
            self.compute_subtraction()
        elif scoring_method == ord('m'):
            self.compute_SPMR()
        elif scoring_method == ord('M'):
            self.compute_max()
        else:
            raise NotImplementedError
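    # Pipeline sketch (comment only; `st` is the track built above):
    #
    #     st.change_normalization_method(ord('M'))  # per-million scaling
    #     st.change_score_method(ord('p'))          # -log10 p-value
    #     st.change_score_method(ord('q'))          # p is reused, then FDR-adjusted
    #
    # Each call recomputes the score column (column 3) in place.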

    cdef compute_pvalue ( self ):
        """Compute -log_{10}(pvalue)
        """
        cdef:
            np.ndarray[np.float32_t] p, c, v
            np.ndarray[np.int32_t] pos
            int64_t l, i, prev_pos
            bytes chrom

        for chrom in sorted(self.data.keys()):
            prev_pos = 0
            pos = self.data[chrom][0]
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = get_pscore( <int32_t>(p[ i ] + self.pseudocount) , c[ i ] + self.pseudocount )
                try:
                    self.pvalue_stat[v[ i ]] += pos[ i ] - prev_pos
                except KeyError:
                    self.pvalue_stat[v[ i ]] = pos[ i ] - prev_pos
                prev_pos = pos[ i ]

        self.scoring_method = ord('p')
        return

    cdef compute_qvalue ( self ):
        """Compute -log_{10}(qvalue)
        """
        cdef:
            object pqtable
            int64_t i, l
            bytes chrom
            np.ndarray v

        # pvalue should be computed first!
        assert self.scoring_method == ord('p')
        # make pqtable
        pqtable = self.make_pq_table()
        # convert p to q
        for chrom in sorted(self.data.keys()):
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = pqtable[ v[ i ] ]
                #v [ i ] = g( v[ i ])

        self.scoring_method = ord('q')
        return

    cpdef object make_pq_table ( self ):
        """Make pvalue-qvalue table.

        Step1: get all pvalues and the length of blocks with each pvalue
        Step2: sort them in decreasing order of pvalue score
        Step3: Apply AFDR method to adjust pvalue and get qvalue for each pvalue

        Return a dictionary of {-log10pvalue:(-log10qvalue,rank,basepairs)} relationships.
        """
        cdef:
            int64_t n, pre_p, this_p, length, pre_l, l, i, j
            float32_t this_v, pre_v, v, q, pre_q # store the p and q scores
            int64_t N, k
            float32_t f
            bytes chrom
            np.ndarray v_chrom, pos_chrom
            dict pvalue_stat
            list unique_values

        assert self.scoring_method == ord('p')

        pvalue_stat = self.pvalue_stat

        N = sum(pvalue_stat.values())
        k = 1                           # rank, weighted by basepairs
        f = -log10(N)
        pre_q = 2147483647              # save the previous q-value

        pvalue2qvalue = Float32to32Map( for_int = False )
        unique_values = sorted(list(pvalue_stat.keys()), reverse=True)
        for i in range(len(unique_values)):
            v = unique_values[i]
            l = pvalue_stat[v]
            q = v + (log10(k) + f)
            if q > pre_q:       # keep q-scores monotonically non-increasing
                q = pre_q
            if q <= 0:
                q = 0
                break
            pvalue2qvalue[ v ] = q
            pre_q = q
            k += l
        # bottom rank pscores all have qscores 0
        for j in range(i, len(unique_values) ):
            v = unique_values[ j ]
            pvalue2qvalue[ v ] = 0
        return pvalue2qvalue
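    # How the adjustment works (comment-only note): with N total
    # basepairs and a score v of rank k (ranks weighted by block
    # length), the Benjamini-Hochberg formula q = p * N / k becomes, in
    # -log10 space,
    #
    #     qscore = v + log10(k) - log10(N)
    #
    # which is exactly `q = v + (log10(k) + f)` above, with f = -log10(N).
    # E.g. v = 5.0, k = 100, N = 10**7 gives q = 5 + 2 - 7 = 0, so that
    # score and everything ranked below it flattens to qscore 0.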

    cdef compute_likelihood ( self ):
        """Calculate log10 likelihood.
        """
        cdef:
            int64_t l, i
            object p, c
            np.ndarray v
            float32_t v1, v2
            float32_t pseudocount

        pseudocount = self.pseudocount

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][ 1 ].flat.__next__ # pileup in treatment
            c = self.data[chrom][ 2 ].flat.__next__ # pileup in control
            v = self.data[chrom][ 3 ]               # score
            l = self.datalength[chrom]
            for i in range(l):
                v1 = p()
                v2 = c()
                v[ i ] = logLR_asym( v1 + pseudocount, v2 + pseudocount ) #logLR( d[ i, 1]/100.0, d[ i, 2]/100.0 )
        self.scoring_method = ord('l')
        return

    cdef compute_sym_likelihood ( self ):
        """Calculate symmetric log10 likelihood.
        """
        cdef:
            int64_t l, i
            object p, c
            np.ndarray v
            float32_t v1, v2
            float32_t pseudocount

        pseudocount = self.pseudocount

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][ 1 ].flat.__next__
            c = self.data[chrom][ 2 ].flat.__next__
            v = self.data[chrom][ 3 ]
            l = self.datalength[chrom]
            for i in range(l):
                v1 = p()
                v2 = c()
                v[ i ] = logLR_sym( v1 + pseudocount, v2 + pseudocount ) #logLR( d[ i, 1]/100.0, d[ i, 2]/100.0 )
        self.scoring_method = ord('s')
        return

    cdef compute_logFE ( self ):
        """Calculate log10 fold enrichment ( with 1 pseudocount ).
        """
        cdef:
            np.ndarray p, c, v
            int64_t l, i
            float32_t pseudocount

        pseudocount = self.pseudocount

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = get_logFE ( p[ i ] + pseudocount, c[ i ] + pseudocount )
        self.scoring_method = ord('f')
        return

    cdef compute_foldenrichment ( self ):
        """Calculate linear scale fold enrichment ( with 1 pseudocount ).
        """
        cdef:
            np.ndarray p, c, v
            int64_t l, i
            float32_t pseudocount

        pseudocount = self.pseudocount

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = ( p[ i ] + pseudocount )/( c[ i ] + pseudocount )
        self.scoring_method = ord('F')
        return

    cdef compute_subtraction ( self ):
        """Calculate the difference between treatment and control pileup.
        """
        cdef:
            np.ndarray p, c, v
            int64_t l, i

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = p[ i ] - c[ i ]
        self.scoring_method = ord('d')
        return

    cdef compute_SPMR ( self ):
        """Calculate signal per million reads, using the effective depth
        chosen by the current normalization method.
        """
        cdef:
            np.ndarray p, v
            int64_t l, i
            float32_t scale

        if self.normalization_method == ord('T') or self.normalization_method == ord('N'):
            scale = self.treat_edm
        elif self.normalization_method == ord('C'):
            scale = self.ctrl_edm
        elif self.normalization_method == ord('M'):
            scale = 1

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = p[ i ] / scale # two digit precision may not be enough...
        self.scoring_method = ord('m')
        return

    cdef compute_max ( self ):
        """Take the maximum of treatment and control pileup as the score.
        """
        cdef:
            np.ndarray p, c, v
            int64_t l, i

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = max(p[ i ],c[ i ])
        self.scoring_method = ord('M')
        return

    cpdef write_bedGraph ( self, fhd, str name, str description, short column = 3):
        """Write all data to fhd in bedGraph Format.

        fhd: a filehandler to save bedGraph.

        name/description: the name and description in track line.

        column: can be 1: chip, 2: control, 3: score
        """
        cdef:
            bytes chrom
            int32_t l, pre, i, p
            float32_t pre_v, v
            set chrs
            np.ndarray pos, value

        assert column in range( 1, 4 ), "column should be 1, 2 or 3."

        write = fhd.write

        if self.trackline:
            # this line is REQUIRED by the wiggle format for UCSC browser
            write( "track type=bedGraph name=\"%s\" description=\"%s\"\n" % ( name, description ) )

        chrs = self.get_chr_names()
        for chrom in sorted(chrs):
            pos = self.data[ chrom ][ 0 ]
            value = self.data[ chrom ][ column ]
            l = self.datalength[ chrom ]
            pre = 0
            if pos.shape[ 0 ] == 0: continue # skip if there's no data
            pre_v = value[ 0 ]
            for i in range( 1, l ):
                v = value[ i ]
                p = pos[ i-1 ]
                #if ('%.5f' % pre_v) != ('%.5f' % v):
                if abs(pre_v - v) > 1e-5: # precision is 5 digits
                    # value changed: write out the finished run and start a new one
                    write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) )
                    pre_v = v
                    pre = p
            p = pos[ -1 ]
            # last one
            write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) )

        return True
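    # Usage sketch (comment only): dump the current score column as a
    # bedGraph with a UCSC track line; the file path is hypothetical.
    #
    #     st.enable_trackline()
    #     with open("scores.bdg", "w") as fhd:
    #         st.write_bedGraph(fhd, "scores", "demo", column=3)
    #
    # Consecutive intervals whose values agree within 1e-5 are merged
    # into a single bedGraph line, which keeps the output compact.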

    cpdef call_peaks (self, float32_t cutoff=5.0, int32_t min_length=200, int32_t max_gap=50, bool call_summits=False):
        """This function tries to find regions within which scores
        are continuously higher than a given cutoff.

        This function is NOT using sliding-windows. Instead, any
        region in the bedGraph above a certain cutoff will be detected;
        nearby regions will then be merged if the gap between them is
        below max_gap. After this, a peak is reported if its length is
        above min_length.

        cutoff:  cutoff of value, default 5. For -log10pvalue, it means 10^-5.
        min_length :  minimum peak length, default 200.
        max_gap   :  maximum gap to merge nearby peaks, default 50.
        call_summits: whether to call sub-peak summits using the signal
                      processing approach in __close_peak2.
        """
        cdef:
            int32_t i
            bytes chrom
            list peak_content
            np.ndarray pos, sample, control, value, above_cutoff, above_cutoff_v, above_cutoff_endpos, above_cutoff_startpos, above_cutoff_sv

        chrs = self.get_chr_names()
        peaks = PeakIO()                      # dictionary to save peaks

        self.cutoff = cutoff
        for chrom in sorted(chrs):
            peak_content = []           # to store points above cutoff

            pos = self.data[chrom][ 0 ]
            sample = self.data[chrom][ 1 ]
            control = self.data[chrom][ 2 ]
            value = self.data[chrom][ 3 ]

            above_cutoff = np.nonzero( value >= cutoff )[0] # indices where score is above cutoff
            above_cutoff_v = value[above_cutoff] # scores where score is above cutoff

            above_cutoff_endpos = pos[above_cutoff] # end positions of regions where score is above cutoff
            above_cutoff_startpos = pos[above_cutoff-1] # start positions of regions where score is above cutoff
            above_cutoff_sv = sample[above_cutoff] # sample pileup height where score is above cutoff

            if above_cutoff_v.size == 0:
                # nothing above cutoff
                continue

            if above_cutoff[0] == 0:
                # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
                above_cutoff_startpos[0] = 0

            # first bit of region above cutoff
            peak_content.append( (above_cutoff_startpos[0], above_cutoff_endpos[0], above_cutoff_v[0], above_cutoff_sv[0], above_cutoff[0]) )
            for i in range( 1,above_cutoff_startpos.size ):
                if above_cutoff_startpos[i] - peak_content[-1][1] <= max_gap:
                    # append to the current potential peak
                    peak_content.append( (above_cutoff_startpos[i], above_cutoff_endpos[i], above_cutoff_v[i], above_cutoff_sv[i], above_cutoff[i]) )
                else:
                    # close the current peak
                    if call_summits:
                        self.__close_peak2(peak_content, peaks, min_length, chrom, max_gap//2 )
                    else:
                        self.__close_peak(peak_content, peaks, min_length, chrom )
                    peak_content = [(above_cutoff_startpos[i], above_cutoff_endpos[i], above_cutoff_v[i], above_cutoff_sv[i], above_cutoff[i]),]

            # save the last peak
            if peak_content:
                if call_summits:
                    self.__close_peak2(peak_content, peaks, min_length, chrom, max_gap//2 )
                else:
                    self.__close_peak(peak_content, peaks, min_length, chrom )

        return peaks
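    # Peak-calling sketch (comment only), continuing the example above:
    #
    #     st.change_score_method(ord('q'))
    #     peaks = st.call_peaks(cutoff=5.0, min_length=200, max_gap=50)
    #
    # cutoff=5.0 on -log10 q-values keeps positions with q <= 1e-5;
    # loosening max_gap merges more neighboring regions into one peak.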

    cdef bool __close_peak (self, list peak_content, object peaks, int32_t min_length,
                            bytes chrom, int32_t smoothlen=0):
        """Close the peak region, output peak boundaries, peak summit
        and scores, then add the peak to the peakIO object.

        In this function, we define the peak summit as the middle
        point of the region with the highest score, in this peak. For
        example, if the region of the highest score is from 100 to
        200, the summit is 150. If there are several regions of the
        same 'highest score', we will first calculate the possible
        summit for each such region, then pick a position close to the
        middle index ( = (len(highest_regions) + 1) / 2 ) of these
        summits. For example, if there are three regions with the same
        highest scores, [100,200], [300,400], [600,700], we will first
        find the possible summits as 150, 350, and 650, and then pick
        the middle index, the 2nd, of the three positions -- 350 as
        the final summit. If there are four regions, we pick the 2nd
        as well.

        peaks: a PeakIO object
        """
        cdef:
            int32_t summit_pos, tstart, tend, tmpindex, summit_index, i, midindex
            float32_t summit_value, tvalue, tsummitvalue

        peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
        if peak_length >= min_length: # if the peak is too small, reject it
            tsummit = []
            summit_pos = 0
            summit_value = 0
            for i in range(len(peak_content)):
                (tstart,tend,tvalue,tsummitvalue, tindex) = peak_content[i]
                #for (tstart,tend,tvalue,tsummitvalue, tindex) in peak_content:
                if not summit_value or summit_value < tsummitvalue:
                    tsummit = [(tend + tstart) / 2, ]
                    tsummit_index = [ tindex, ]
                    summit_value = tsummitvalue
                elif summit_value == tsummitvalue:
                    # remember continuous summit values
                    tsummit.append(int((tend + tstart) / 2))
                    tsummit_index.append( tindex )
            # the middle of all highest points in peak region is defined as summit
            midindex = int((len(tsummit) + 1) / 2) - 1
            summit_pos   = tsummit[ midindex ]
            summit_index = tsummit_index[ midindex ]
            if self.scoring_method == ord('q'):
                qscore = self.data[chrom][3][ summit_index ]
            else:
                # if q value is not computed, use -1
                qscore = -1

            peaks.add( chrom,
                       peak_content[ 0 ][ 0 ],
                       peak_content[ -1 ][ 1 ],
                       summit      = summit_pos,
                       peak_score  = self.data[chrom][ 3 ][ summit_index ],
                       pileup      = self.data[chrom][ 1 ][ summit_index ], # should be the same as summit_value
                       pscore      = get_pscore(self.data[chrom][ 1 ][ summit_index ], self.data[chrom][ 2 ][ summit_index ]),
                       fold_change = ( self.data[chrom][ 1 ][ summit_index ] + self.pseudocount ) / ( self.data[chrom][ 2 ][ summit_index ] + self.pseudocount ),
                       qscore      = qscore,
                       )
            # start a new peak
            return True
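    # Worked tie-break example (comment only), matching the docstring:
    # candidate summits [150, 350, 650] give
    #     midindex = int((3 + 1) / 2) - 1 = 1
    # so the 2nd summit, 350, wins. With four candidates,
    # int((4 + 1) / 2) - 1 = 1 still picks the 2nd.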

    cdef bool __close_peak2 (self, list peak_content, object peaks, int32_t min_length,
                             bytes chrom, int32_t smoothlen=51,
                             float32_t min_valley = 0.9):
        """Close the peak region, output peak boundaries, peak summit
        and scores, then add the peak to the peakIO object.

        In this function, we use signal processing methods to smooth
        the scores in the peak region, find the maxima and enforce the
        peaky shape, and to define the best maxima as the peak
        summit. The functions used for signal processing are the 'maxima'
        (with 2nd order polynomial filter) and 'enforce_peakyness'
        functions in SignalProcessing.pyx.

        peaks: a PeakIO object
        """
        cdef:
            int32_t summit_pos, tstart, tend, tmpindex, summit_index, summit_offset
            int32_t start, end, i, j, start_boundary
            float32_t summit_value, tvalue, tsummitvalue
            # np.ndarray[np.float32_t, ndim=1] w
            np.ndarray[np.float32_t, ndim=1] peakdata
            np.ndarray[np.int32_t, ndim=1] peakindices, summit_offsets

        # Add 10 bp padding to peak region so that we can get true minima
        end = peak_content[ -1 ][ 1 ] + 10
        start = peak_content[ 0 ][ 0 ] - 10
        if start < 0:
            start_boundary = 10 + start
            start = 0
        else:
            start_boundary = 10
        peak_length = end - start
        if end - start < min_length: return # if the region is too small, reject it

        peakdata = np.zeros(end - start, dtype='float32')
        peakindices = np.zeros(end - start, dtype='int32')
        for (tstart,tend,tvalue,tsvalue, tmpindex) in peak_content:
            i = tstart - start + start_boundary
            j = tend - start + start_boundary
            peakdata[i:j] = tsvalue
            peakindices[i:j] = tmpindex
        summit_offsets = maxima(peakdata, smoothlen)
        if summit_offsets.shape[0] == 0:
            # **failsafe** if no summits, fall back on old approach #
            return self.__close_peak(peak_content, peaks, min_length, chrom)

        # remove maxima that occurred in padding
        i = np.searchsorted(summit_offsets, start_boundary)
        j = np.searchsorted(summit_offsets, peak_length + start_boundary, 'right')
        summit_offsets = summit_offsets[i:j]

        summit_offsets = enforce_peakyness(peakdata, summit_offsets)
        if summit_offsets.shape[0] == 0:
            # **failsafe** if no summits, fall back on old approach #
            return self.__close_peak(peak_content, peaks, min_length, chrom)

        summit_indices = peakindices[summit_offsets]
        summit_offsets -= start_boundary

        peak_scores = self.data[chrom][3][ summit_indices ]
        if not (peak_scores > self.cutoff).all():
            return self.__close_peak(peak_content, peaks, min_length, chrom)
        for summit_offset, summit_index in zip(summit_offsets, summit_indices):
            if self.scoring_method == ord('q'):
                qscore = self.data[chrom][3][ summit_index ]
            else:
                # if q value is not computed, use -1
                qscore = -1
            peaks.add( chrom,
                       peak_content[ 0 ][ 0 ],
                       peak_content[ -1 ][ 1 ],
                       summit      = start + summit_offset,
                       peak_score  = self.data[chrom][3][ summit_index ],
                       pileup      = self.data[chrom][1][ summit_index ], # should be the same as summit_value
                       pscore      = get_pscore(self.data[chrom][ 1 ][ summit_index ], self.data[chrom][ 2 ][ summit_index ]),
                       fold_change = ( self.data[chrom][ 1 ][ summit_index ] + self.pseudocount ) / ( self.data[chrom][ 2 ][ summit_index ] + self.pseudocount ),
                       qscore      = qscore,
                       )
        # start a new peak
        return True
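    # Note (comment only): because peakdata carries start_boundary bp of
    # padding on the left, the offsets reported by maxima() are in
    # padded coordinates. The two np.searchsorted calls above drop any
    # maxima that fall inside the padding, and `summit_offsets -=
    # start_boundary` converts the survivors back to peak-relative
    # positions before `start + summit_offset` is reported as a summit.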

    cdef int64_t total ( self ):
        """Return the number of regions in this object.
        """
        cdef:
            int64_t t
            bytes chrom

        t = 0
        for chrom in sorted(self.data.keys()):
            t += self.datalength[chrom]
        return t

    cpdef call_broadpeaks (self, float32_t lvl1_cutoff=5.0, float32_t lvl2_cutoff=1.0, int32_t min_length=200, int32_t lvl1_max_gap=50, int32_t lvl2_max_gap=400):
        """This function tries to find enriched regions within which
        scores are continuously higher than a given cutoff for level
        1, and links them using the gap above the level 2 cutoff with a
        maximum length of lvl2_max_gap.

        lvl1_cutoff:  cutoff of value at enriched regions, default 5.0.
        lvl2_cutoff:  cutoff of value at linkage regions, default 1.0.
        min_length :  minimum peak length, default 200.
        lvl1_max_gap :  maximum gap to merge nearby enriched peaks, default 50.
        lvl2_max_gap :  maximum length of linkage regions, default 400.

        Return both the general PeakIO object for highly enriched regions
        and gapped broad regions in BroadPeakIO.
        """
        cdef:
            int32_t i, j
            bytes chrom

        assert lvl1_cutoff > lvl2_cutoff, "level 1 cutoff should be larger than level 2."
        assert lvl1_max_gap < lvl2_max_gap, "level 2 maximum gap should be larger than level 1."
        lvl1_peaks = self.call_peaks(cutoff=lvl1_cutoff, min_length=min_length, max_gap=lvl1_max_gap)
        lvl2_peaks = self.call_peaks(cutoff=lvl2_cutoff, min_length=min_length, max_gap=lvl2_max_gap)
        chrs = lvl1_peaks.peaks.keys()
        broadpeaks = BroadPeakIO()
        # use lvl2_peaks as linking regions between lvl1_peaks
        for chrom in sorted(chrs):
            lvl1peakschrom = lvl1_peaks.peaks[chrom]
            lvl2peakschrom = lvl2_peaks.peaks[chrom]
            lvl1peakschrom_next = iter(lvl1peakschrom).__next__
            tmppeakset = []             # to temporarily store lvl1 region inside a lvl2 region
            # our assumption is lvl1 regions should be included in lvl2 regions
            try:
                lvl1 = lvl1peakschrom_next()
                for i in range( len(lvl2peakschrom) ):
                    # for each lvl2 peak, find all lvl1 peaks inside
                    # I assume lvl1 peaks can be ALL covered by lvl2 peaks.
                    lvl2 = lvl2peakschrom[i]
                    while True:
                        if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]:
                            tmppeakset.append(lvl1)
                            lvl1 = lvl1peakschrom_next()
                        else:
                            # make a hierarchical broad peak
                            #print lvl2["start"], lvl2["end"], lvl2["score"]
                            self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset )
                            tmppeakset = []
                            break
            except StopIteration:
                # no more strong (aka lvl1) peaks left
                self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset )
                tmppeakset = []
                # add the rest lvl2 peaks
                for j in range( i+1, len(lvl2peakschrom) ):
                    self.__add_broadpeak( broadpeaks, chrom, lvl2peakschrom[j], tmppeakset )

        return lvl1_peaks, broadpeaks

    def __add_broadpeak (self, bpeaks, bytes chrom, dict lvl2peak, list lvl1peakset):
        """Internal function to create a broad peak.
        """
        cdef:
            int32_t blockNum, start, end
            bytes blockSizes, blockStarts, thickStart, thickEnd

        start = lvl2peak["start"]
        end   = lvl2peak["end"]

        if not lvl1peakset:
            # the following code will add those broad/lvl2 peaks with no strong/lvl1 peaks inside
            # will complement by adding 1bp start and end to this region
            # may change in the future if gappedPeak format was improved.
            bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=(b"%d" % start), thickEnd=(b"%d" % end),
                       blockNum = 2, blockSizes = b"1,1", blockStarts = (b"0,%d" % (end-start-1)), pileup = lvl2peak["pileup"],
                       pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"],
                       qscore = lvl2peak["qscore"] )
            return bpeaks

        thickStart = b"%d" % lvl1peakset[0]["start"]
        thickEnd   = b"%d" % lvl1peakset[-1]["end"]
        blockNum   = int(len(lvl1peakset))
        blockSizes = b",".join( [b"%d" % x["length"] for x in lvl1peakset] )
        blockStarts = b",".join( [b"%d" % (x["start"]-start) for x in lvl1peakset] )

        if lvl2peak["start"] != thickStart:
            # add 1bp mark for the start of lvl2 peak
            thickStart = b"%d" % start
            blockNum += 1
            blockSizes = b"1,"+blockSizes
            blockStarts = b"0,"+blockStarts
        if lvl2peak["end"] != thickEnd:
            # add 1bp mark for the end of lvl2 peak
            thickEnd = b"%d" % end
            blockNum += 1
            blockSizes = blockSizes+b",1"
            blockStarts = blockStarts + b"," + (b"%d" % (end-start-1))

        # add to BroadPeakIO object
        bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=thickStart, thickEnd=thickEnd,
                   blockNum = blockNum, blockSizes = blockSizes, blockStarts = blockStarts, pileup = lvl2peak["pileup"],
                   pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"],
                   qscore = lvl2peak["qscore"] )
        return bpeaks

cdef class TwoConditionScores:
    """Class for saving two condition comparison scores.
    """
    cdef:
        dict data                # dictionary for data of each chromosome
        dict datalength          # length of data array of each chromosome
        float32_t cond1_factor   # factor to apply to cond1 pileup values
        float32_t cond2_factor   # factor to apply to cond2 pileup values
        float32_t pseudocount    # the pseudocount used to calculate LLR
        float32_t cutoff
        object t1bdg, c1bdg, t2bdg, c2bdg
        dict pvalue_stat1, pvalue_stat2, pvalue_stat3

    def __init__ (self, t1bdg, c1bdg, t2bdg, c2bdg, float32_t cond1_factor = 1.0, float32_t cond2_factor = 1.0, float32_t pseudocount = 0.01, float32_t proportion_background_empirical_distribution = 0.99999 ):
        """
        t1bdg: a bedGraphTrackI object for treat 1
        c1bdg: a bedGraphTrackI object for control 1
        t2bdg: a bedGraphTrackI object for treat 2
        c2bdg: a bedGraphTrackI object for control 2

        cond1_factor: this will be multiplied to values in t1bdg and c1bdg
        cond2_factor: this will be multiplied to values in t2bdg and c2bdg

        pseudocount: pseudocount, by default 0.01.

        proportion_background_empirical_distribution: proportion of genome as the background to build empirical distribution
        """
        self.data = {}           # for each chromosome, there is a l*4
                                 # matrix. First column: end position
                                 # of a region; second: LLR of t1 vs c1;
                                 # third: LLR of t2 vs c2; fourth: LLR
                                 # of t1 vs t2.
        self.datalength = {}
        self.cond1_factor = cond1_factor
        self.cond2_factor = cond2_factor
        self.pseudocount = pseudocount
        self.pvalue_stat1 = {}
        self.pvalue_stat2 = {}
        self.pvalue_stat3 = {}
        self.t1bdg = t1bdg
        self.c1bdg = c1bdg
        self.t2bdg = t2bdg
        self.c2bdg = c2bdg

        #self.empirical_distr_llr = [] # save all values in histogram

    cpdef set_pseudocount( self, float32_t pseudocount ):
        self.pseudocount = pseudocount

    cpdef build ( self ):
        """Compute scores from the 3 types of comparisons and store them in self.data.
        """
        cdef:
            set common_chrs
            bytes chrname
            int32_t chrom_max_len
        # common chromosome names
        common_chrs = self.get_common_chrs()
        for chrname in common_chrs:
            (cond1_treat_ps, cond1_treat_vs) = self.t1bdg.get_data_by_chr(chrname)
            (cond1_control_ps, cond1_control_vs) = self.c1bdg.get_data_by_chr(chrname)
            (cond2_treat_ps, cond2_treat_vs) = self.t2bdg.get_data_by_chr(chrname)
            (cond2_control_ps, cond2_control_vs) = self.c2bdg.get_data_by_chr(chrname)
            chrom_max_len = len(cond1_treat_ps) + len(cond1_control_ps) +\
                            len(cond2_treat_ps) + len(cond2_control_ps)
            self.add_chromosome( chrname, chrom_max_len )
            self.build_chromosome( chrname,
                                   cond1_treat_ps, cond1_control_ps,
                                   cond2_treat_ps, cond2_control_ps,
                                   cond1_treat_vs, cond1_control_vs,
                                   cond2_treat_vs, cond2_control_vs )

    cdef build_chromosome( self, chrname,
                           cond1_treat_ps, cond1_control_ps,
                           cond2_treat_ps, cond2_control_ps,
                           cond1_treat_vs, cond1_control_vs,
                           cond2_treat_vs, cond2_control_vs ):
        """Internal function to calculate scores for three types of comparisons.

        cond1_treat_ps, cond1_control_ps: positions of treat and control of condition 1
        cond2_treat_ps, cond2_control_ps: positions of treat and control of condition 2
        cond1_treat_vs, cond1_control_vs: values of treat and control of condition 1
        cond2_treat_vs, cond2_control_vs: values of treat and control of condition 2
        """
        cdef:
            int32_t c1tp, c1cp, c2tp, c2cp, minp, pre_p
            float32_t c1tv, c1cv, c2tv, c2cv
        c1tpn = iter(cond1_treat_ps).__next__
        c1cpn = iter(cond1_control_ps).__next__
        c2tpn = iter(cond2_treat_ps).__next__
        c2cpn = iter(cond2_control_ps).__next__
        c1tvn = iter(cond1_treat_vs).__next__
        c1cvn = iter(cond1_control_vs).__next__
        c2tvn = iter(cond2_treat_vs).__next__
        c2cvn = iter(cond2_control_vs).__next__

        pre_p = 0

        try:
            c1tp = c1tpn()
            c1tv = c1tvn()
            c1cp = c1cpn()
            c1cv = c1cvn()
            c2tp = c2tpn()
            c2tv = c2tvn()
            c2cp = c2cpn()
            c2cv = c2cvn()
            while True:
                minp = min(c1tp, c1cp, c2tp, c2cp)
                self.add( chrname, pre_p, c1tv, c1cv, c2tv, c2cv )
                pre_p = minp
                # advance every iterator whose current interval ends here
                if c1tp == minp:
                    c1tp = c1tpn()
                    c1tv = c1tvn()
                if c1cp == minp:
                    c1cp = c1cpn()
                    c1cv = c1cvn()
                if c2tp == minp:
                    c2tp = c2tpn()
                    c2tv = c2tvn()
                if c2cp == minp:
                    c2cp = c2cpn()
                    c2cv = c2cvn()
        except StopIteration:
            # meet the end of either bedGraphTrackI, simply exit
            pass
        return

    cdef set get_common_chrs ( self ):
        cdef:
            set t1chrs, c1chrs, t2chrs, c2chrs, common
        t1chrs = self.t1bdg.get_chr_names()
        c1chrs = self.c1bdg.get_chr_names()
        t2chrs = self.t2bdg.get_chr_names()
        c2chrs = self.c2bdg.get_chr_names()
        common = reduce(lambda x,y:x.intersection(y), (t1chrs,c1chrs,t2chrs,c2chrs))
        return common

    cdef add_chromosome ( self, bytes chrom, int32_t chrom_max_len ):
        """
        chrom: chromosome name
        chrom_max_len: maximum number of data points in this chromosome
        """
        if chrom not in self.data:
            self.data[chrom] = [ np.zeros( chrom_max_len, dtype="int32" ),   # pos
                                 np.zeros( chrom_max_len, dtype="float32" ), # LLR t1 vs c1
                                 np.zeros( chrom_max_len, dtype="float32" ), # LLR t2 vs c2
                                 np.zeros( chrom_max_len, dtype="float32" )] # LLR t1 vs t2
            self.datalength[chrom] = 0

    cdef add (self, bytes chromosome, int32_t endpos, float32_t t1, float32_t c1, float32_t t2, float32_t c2):
        """Take chr-endpos-sample1-control1-sample2-control2 and
        compute logLR for t1 vs c1, t2 vs c2, and t1 vs t2, then save
        the values.

        chromosome: chromosome name in string
        endpos    : end position of each interval in integer
        t1        : Sample 1 ChIP pileup value of each interval in float
        c1        : Sample 1 Control pileup value of each interval in float
        t2        : Sample 2 ChIP pileup value of each interval in float
        c2        : Sample 2 Control pileup value of each interval in float

        *Warning* Need to add regions continuously.
        """
        cdef:
            int32_t i

        i = self.datalength[chromosome]
        c = self.data[chromosome]
        c[0][ i ] = endpos
        c[1][ i ] = logLR_asym( (t1+self.pseudocount)*self.cond1_factor, (c1+self.pseudocount)*self.cond1_factor )
        c[2][ i ] = logLR_asym( (t2+self.pseudocount)*self.cond2_factor, (c2+self.pseudocount)*self.cond2_factor )
        c[3][ i ] = logLR_sym( (t1+self.pseudocount)*self.cond1_factor, (t2+self.pseudocount)*self.cond2_factor )
        self.datalength[chromosome] += 1
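    # Worked numbers (comment only): with pseudocount 0.01 and both
    # factors 1.0, an interval with t1=10, c1=5, t2=10, c2=5 gets
    #     c[1] ~ logLR_asym(10.01, 5.01) ~ +0.84   (cond1 enriched)
    #     c[2] ~ logLR_asym(10.01, 5.01) ~ +0.84   (cond2 enriched)
    #     c[3] = logLR_sym(10.01, 10.01) = 0       (no difference)
    # so column 3 only moves when the two ChIP tracks disagree.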

    cpdef finalize ( self ):
        """
        Adjust array size of each chromosome.
        """
        cdef:
            bytes chrom
            int32_t l

        for chrom in sorted(self.data.keys()):
            d = self.data[chrom]
            l = self.datalength[chrom]
            d[0].resize( l, refcheck = False )
            d[1].resize( l, refcheck = False )
            d[2].resize( l, refcheck = False )
            d[3].resize( l, refcheck = False )
        return

    cpdef get_data_by_chr (self, bytes chromosome):
        """Return array of counts by chromosome.

        The return value is the [end positions, LLR t1 vs c1, LLR t2 vs
        c2, LLR t1 vs t2] list for that chromosome.
        """
        if chromosome in self.data:
            return self.data[chromosome]
        else:
            return None

    cpdef get_chr_names (self):
        """Return all the chromosome names stored.
        """
        l = set(self.data.keys())
        return l

    cpdef write_bedGraph ( self, fhd, str name, str description, int32_t column = 3):
        """Write all data to fhd in bedGraph Format.

        fhd: a filehandler to save bedGraph.

        name/description: the name and description in track line.

        column: can be 1: cond1 chip vs cond1 ctrl, 2: cond2 chip vs cond2 ctrl, 3: cond1 chip vs cond2 chip
        """
        cdef:
            bytes chrom
            int32_t l, pre, i, p
            float32_t pre_v, v
            set chrs
            np.ndarray pos, value

        assert column in range( 1, 4 ), "column should be 1, 2 or 3."

        write = fhd.write

        # # this line is REQUIRED by the wiggle format for UCSC browser
        # write( "track type=bedGraph name=\"%s\" description=\"%s\"\n" % ( name, description ) )

        chrs = self.get_chr_names()
        for chrom in sorted(chrs):
            pos = self.data[ chrom ][ 0 ]
            value = self.data[ chrom ][ column ]
            l = self.datalength[ chrom ]
            pre = 0
            if pos.shape[ 0 ] == 0: continue # skip if there's no data
            pre_v = value[ 0 ]
            for i in range( 1, l ):
                v = value[ i ]
                p = pos[ i-1 ]
                if abs(pre_v - v) >= 1e-6:
                    write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) )
                    pre_v = v
                    pre = p
            p = pos[ -1 ]
            # last one
            write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) )

        return True

    cpdef write_matrix ( self, fhd, str name, str description ):
        """Write all data to fhd as a matrix: the first column is the
        interval (chrom:start_end) and the remaining columns are the
        three LLR scores.

        fhd: a filehandler to save the matrix.
        """
        cdef:
            bytes chrom
            int32_t l, pre, i, p
            float32_t v1, v2, v3
            np.ndarray pos, value1, value2, value3

        write = fhd.write

        chrs = self.get_chr_names()
        for chrom in sorted(chrs):
            [ pos, value1, value2, value3 ] = self.data[ chrom ]
            l = self.datalength[ chrom ]
            pre = 0
            if pos.shape[ 0 ] == 0: continue # skip if there's no data
            for i in range( 0, l ):
                v1 = value1[ i ]
                v2 = value2[ i ]
                v3 = value3[ i ]
                p = pos[ i ]
                write( "%s:%d_%d\t%.5f\t%.5f\t%.5f\n" % ( chrom.decode(), pre, p, v1, v2, v3 ) )
                pre = p

        return True

    cpdef tuple call_peaks (self, float32_t cutoff=3, int32_t min_length=200, int32_t max_gap = 100,
                            bool call_summits=False):
        """This function tries to find regions within which scores
        are continuously higher than a given cutoff.

        This function is NOT using sliding-windows. Instead, any
        region in the bedGraph above a certain cutoff will be detected;
        nearby regions will then be merged if the gap between them is
        below max_gap. After this, a peak is reported if its length is
        above min_length.

        cutoff:  cutoff of value, default 3. For log10 LR, it means 1000 or -1000.
        min_length :  minimum peak length, default 200.
        max_gap   :  maximum gap to merge nearby peaks, default 100.
        ptrack:  an optional track for pileup heights. If it's not None, use it to find summits. Otherwise, use self/scoreTrack.
        """
        cdef:
            int32_t i
            bytes chrom
            np.ndarray pos, t1_vs_c1, t2_vs_c2, t1_vs_t2, \
                       cond1_over_cond2, cond2_over_cond1, cond1_equal_cond2, \
                       cond1_sig, cond2_sig,\
                       cat1, cat2, cat3, \
                       cat1_startpos, cat1_endpos, cat2_startpos, cat2_endpos, \
                       cat3_startpos, cat3_endpos
        chrs = self.get_chr_names()
        cat1_peaks = PeakIO()       # dictionary to save peaks significant at condition 1
        cat2_peaks = PeakIO()       # dictionary to save peaks significant at condition 2
        cat3_peaks = PeakIO()       # dictionary to save peaks significant in both conditions

        self.cutoff = cutoff

        for chrom in sorted(chrs):
            pos = self.data[chrom][ 0 ]
            t1_vs_c1 = self.data[chrom][ 1 ]
            t2_vs_c2 = self.data[chrom][ 2 ]
            t1_vs_t2 = self.data[chrom][ 3 ]
            and_ = np.logical_and
            cond1_over_cond2 = t1_vs_t2 >= cutoff    # regions with stronger cond1 signals
            cond2_over_cond1 = t1_vs_t2 <= -1*cutoff # regions with stronger cond2 signals
            cond1_equal_cond2 = and_( t1_vs_t2 >= -1*cutoff, t1_vs_t2 <= cutoff )
            cond1_sig = t1_vs_c1 >= cutoff  # enriched regions in condition 1
            cond2_sig = t2_vs_c2 >= cutoff  # enriched regions in condition 2
            # indices where score is above cutoff
            cat1 = np.where( and_( cond1_sig, cond1_over_cond2 ) )[ 0 ] # cond1 stronger than cond2, the indices
            cat2 = np.where( and_( cond2_over_cond1, cond2_sig ) )[ 0 ] # cond2 stronger than cond1, the indices
            cat3 = np.where( and_( and_( cond1_sig, cond2_sig ),       # cond1 and cond2 are equal, the indices
                                   cond1_equal_cond2 ) ) [ 0 ]

            cat1_endpos = pos[cat1]     # end positions of regions where score is above cutoff
            cat1_startpos = pos[cat1-1] # start positions of regions where score is above cutoff
            cat2_endpos = pos[cat2]     # end positions of regions where score is above cutoff
            cat2_startpos = pos[cat2-1] # start positions of regions where score is above cutoff
            cat3_endpos = pos[cat3]     # end positions of regions where score is above cutoff
            cat3_startpos = pos[cat3-1] # start positions of regions where score is above cutoff

            # for cat1: condition 1 stronger regions
            self.__add_a_peak ( cat1_peaks, chrom, cat1, cat1_startpos, cat1_endpos, t1_vs_t2, max_gap, min_length )
            # for cat2: condition 2 stronger regions
            self.__add_a_peak ( cat2_peaks, chrom, cat2, cat2_startpos, cat2_endpos, -1 * t1_vs_t2, max_gap, min_length )
            # for cat3: commonly strong regions
            self.__add_a_peak ( cat3_peaks, chrom, cat3, cat3_startpos, cat3_endpos, abs(t1_vs_t2), max_gap, min_length )

        return (cat1_peaks, cat2_peaks, cat3_peaks)

    cdef object __add_a_peak ( self, object peaks, bytes chrom, np.ndarray indices, np.ndarray startpos, np.ndarray endpos,
                               np.ndarray score, int32_t max_gap, int32_t min_length ):
        """For a given chromosome, merge nearby significant regions,
        filter out smaller regions, then add regions to the PeakIO
        object.
        """
        cdef:
            int32_t i
            list peak_content
            float32_t mean_logLR

        if startpos.size > 0:
            # if it is not empty
            peak_content = []
            if indices[0] == 0:
                # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
                startpos[0] = 0
            # first bit of region above cutoff
            peak_content.append( (startpos[0], endpos[0], score[indices[ 0 ]]) )
            for i in range( 1, startpos.size ):
                if startpos[i] - peak_content[-1][1] <= max_gap:
                    # append to the current potential peak
                    peak_content.append( ( startpos[i], endpos[i], score[indices[ i ]] ) )
                else:
                    # close the current peak
                    if peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] >= min_length:
                        mean_logLR = self.mean_from_peakcontent( peak_content )
                        #if peak_content[0][0] == 22414956:
                        #    print(f"{peak_content} {mean_logLR}")
                        peaks.add( chrom, peak_content[0][0], peak_content[-1][1],
                                   summit = -1, peak_score = mean_logLR, pileup = 0, pscore = 0,
                                   fold_change = 0, qscore = 0,
                                   )
                    peak_content = [(startpos[i], endpos[i], score[ indices[ i ] ]),]

            # save the last peak
            if peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] >= min_length:
                mean_logLR = self.mean_from_peakcontent( peak_content )
                peaks.add( chrom, peak_content[0][0], peak_content[-1][1],
                           summit = -1, peak_score = mean_logLR, pileup = 0, pscore = 0,
                           fold_change = 0, qscore = 0,
                           )
        return

    cdef float32_t mean_from_peakcontent ( self, list peakcontent ):
        """Return the length-weighted mean score over a peak_content list.
        """
        cdef:
            int32_t tmp_s, tmp_e
            int32_t l, i
            float64_t tmp_v, sum_v #for better precision
            float32_t r

        l = 0
        sum_v = 0 #initialize sum_v as 0
        for i in range( len(peakcontent) ):
            tmp_s = peakcontent[i][0]
            tmp_e = peakcontent[i][1]
            tmp_v = peakcontent[i][2]
            sum_v += tmp_v * ( tmp_e - tmp_s )
            l += tmp_e - tmp_s

        r = <float32_t>( sum_v / l )
        return r

    cdef int64_t total ( self ):
        """Return the number of regions in this object.
        """
        cdef:
            int64_t t
            bytes chrom

        t = 0
        for chrom in sorted(self.data.keys()):
            t += self.datalength[chrom]
        return t