# cython: language_level=3
# cython: profile=True
# Time-stamp: <2024-10-10 16:45:13 Tao Liu>

"""Module for Feature IO classes.

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""

# ------------------------------------
# python modules
# ------------------------------------
from copy import copy
from functools import reduce

# ------------------------------------
# MACS3 modules
# ------------------------------------
from MACS3.Signal.SignalProcessing import maxima, enforce_valleys, enforce_peakyness
from MACS3.Signal.Prob import poisson_cdf
from MACS3.IO.PeakIO import PeakIO, BroadPeakIO

# ------------------------------------
# Other modules
# ------------------------------------
cimport cython
import numpy as np
cimport numpy as np
from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t
from cpython cimport bool
from cykhash import PyObjectMap, Float32to32Map

# ------------------------------------
# C lib
# ------------------------------------
from libc.math cimport log10, log, floor, ceil

# ------------------------------------
# constants
# ------------------------------------
__version__ = "scoreTrack $Revision$"
__author__ = "Tao Liu <vladimir.liu@gmail.com>"
__doc__ = "scoreTrack classes"

# ------------------------------------
# Misc functions
# ------------------------------------
cdef inline int32_t int_max(int32_t a, int32_t b): return a if a >= b else b
cdef inline int32_t int_min(int32_t a, int32_t b): return a if a <= b else b

LOG10_E = 0.43429448190325176

pscore_dict = PyObjectMap()

cdef float32_t get_pscore ( int32_t observed, float32_t expectation ):
    """Get p-value score from Poisson test. First check the existing
    table; if that fails, call the poisson_cdf function, then store
    the result in the table.
    """
    cdef:
        float64_t score

    try:
        return pscore_dict[(observed, expectation)]
    except KeyError:
        score = -1 * poisson_cdf(observed, expectation, False, True)
        pscore_dict[(observed, expectation)] = score
        return score
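
# Illustrative example (not part of the original file; assumes poisson_cdf
# with log_pvalue=True returns the log10 upper-tail p-value, so get_pscore
# yields -log10(p)):
#
#   score = get_pscore(10, 2.0)    # P(X >= 10 | lambda=2) ~ 4.6e-5 -> score ~ 4.3
#   score = get_pscore(10, 2.0)    # repeated query is served from pscore_dict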

asym_logLR_dict = PyObjectMap()

cdef float32_t logLR_asym ( float32_t x, float32_t y ):
    """Calculate the log10 likelihood ratio between H1 ( enriched ) and
    H0 ( chromatin bias ). Set minus sign for depletion.

    *asymmetric version*
    """
    cdef:
        float32_t s

    if (x,y) in asym_logLR_dict:
        return asym_logLR_dict[ ( x, y ) ]
    else:
        if x > y:
            s = (x*(log(x)-log(y))+y-x)*LOG10_E
        elif x < y:
            s = (x*(-log(x)+log(y))-y+x)*LOG10_E
        else:
            s = 0
        asym_logLR_dict[ ( x, y ) ] = s
        return s
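
# Worked example (illustrative): for x=10, y=5 the x > y branch gives
#   s = (10*(ln 10 - ln 5) + 5 - 10) * LOG10_E ~ 0.84
# while x=5, y=10 takes the x < y branch and returns a negative (depletion)
# score; the two branches are not mirror images, hence "asymmetric".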

sym_logLR_dict = PyObjectMap()

cdef float32_t logLR_sym ( float32_t x, float32_t y ):
    """Calculate the log10 likelihood ratio between H1 ( enriched ) and
    H0 ( another enriched ). Set minus sign for H0>H1.

    * symmetric version *
    """
    cdef:
        float32_t s

    if (x,y) in sym_logLR_dict:
        return sym_logLR_dict[ ( x, y ) ]
    else:
        if x > y:
            s = (x*(log(x)-log(y))+y-x)*LOG10_E
        elif y > x:
            s = (y*(log(x)-log(y))+y-x)*LOG10_E
        else:
            s = 0
        sym_logLR_dict[ ( x, y ) ] = s
        return s
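
# Worked example (illustrative): logLR_sym(10, 5) ~ 0.84 and logLR_sym(5, 10)
# ~ -0.84; swapping the two enriched tracks only flips the sign, hence
# "symmetric".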

cdef float32_t get_logFE ( float32_t x, float32_t y ):
    """Return the log10 fold enrichment; the pseudocount is expected
    to be added by the caller.
    """
    return log10( x/y )

cdef float32_t get_subtraction ( float32_t x, float32_t y):
    """Return the subtraction x - y.
    """
    return x - y

# ------------------------------------
# Classes
# ------------------------------------

cdef class ScoreTrackII:
    """Class for a container to keep signals of each genomic position,
    including 1. score, 2. treatment pileup, and 3. control pileup.

    It also contains scoring methods and call_peak functions.
    """
    cdef:
        dict data                      # dictionary for data of each chromosome
        dict datalength                # length of data array of each chromosome
        bool trackline                 # whether trackline should be saved in bedGraph
        float32_t treat_edm            # seq depth in million of treatment
        float32_t ctrl_edm             # seq depth in million of control
        char scoring_method            # method for calculating scores.
        char normalization_method      # scale to control? scale to treatment? both scale to 1 million reads?
        float32_t pseudocount          # the pseudocount used to calculate logLR, FE or logFE
        float32_t cutoff
        dict pvalue_stat               # save pvalue<->length dictionary

    def __init__ (self, float32_t treat_depth, float32_t ctrl_depth, float32_t pseudocount = 1.0 ):
        """Initialize.

        treat_depth and ctrl_depth are effective depths in million:
            sequencing depth in million after duplicates being
            filtered. If treatment is scaled down to control sample
            size, then this should be control sample size in million,
            and vice versa.

        pseudocount: a pseudocount used to calculate logLR, FE or
            logFE. Please note this value will not be changed with the
            normalization method. So if you really want to set a
            pseudocount of 1 per million reads, set it after you
            normalize treat and control by million reads by
            `change_normalization_method(ord('M'))`.
        """
        self.data = {}                  # for each chromosome, there is a l*4
                                        # matrix. First column: end position
                                        # of a region; second: treatment
                                        # pileup; third: control pileup;
                                        # fourth: score ( can be
                                        # p/q-value/likelihood
                                        # ratio/fold-enrichment/subtraction
                                        # depending on -c setting)
        self.datalength = {}
        self.trackline = False
        self.treat_edm = treat_depth
        self.ctrl_edm = ctrl_depth
        # scoring_method:  p: -log10 pvalue;
        #                  q: -log10 qvalue;
        #                  l: log10 likelihood ratio ( minus for depletion )
        #                  f: log10 fold enrichment
        #                  F: linear fold enrichment
        #                  d: subtraction
        #                  m: fragment pileup per million reads
        #                  N: not set
        self.scoring_method = ord("N")
        # normalization_method: T: scale to depth of treatment;
        #                       C: scale to depth of control;
        #                       M: scale to depth of 1 million;
        #                       N: not set/ raw pileup
        self.normalization_method = ord("N")
        self.pseudocount = pseudocount
        self.pvalue_stat = {}
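
    # Usage sketch (illustrative; made-up depths and intervals, not part of
    # the original file):
    #
    #   st = ScoreTrackII(treat_depth=10.0, ctrl_depth=10.0)
    #   st.add_chromosome(b"chr1", 3)
    #   st.add(b"chr1", 100, 15.0, 2.0)    # interval [0, 100): chip=15, control=2
    #   st.add(b"chr1", 200, 3.0, 2.0)     # interval [100, 200)
    #   st.finalize()
    #   st.change_score_method(ord('p'))   # fill the score column with -log10 p-values
    #   peaks = st.call_peaks(cutoff=5.0, min_length=200, max_gap=50)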

    cpdef set_pseudocount( self, float32_t pseudocount ):
        self.pseudocount = pseudocount

    cpdef enable_trackline( self ):
        """Turn on trackline with bedGraph output.
        """
        self.trackline = True

    cpdef add_chromosome ( self, bytes chrom, int32_t chrom_max_len ):
        """
        chrom: chromosome name
        chrom_max_len: maximum number of data points in this chromosome
        """
        if chrom not in self.data:
            #self.data[chrom] = np.zeros( ( chrom_max_len, 4 ), dtype="int32" ) # remember col #2-4 is actual value * 100, I use integer here.
            self.data[chrom] = [ np.zeros( chrom_max_len, dtype="int32" ),   # pos
                                 np.zeros( chrom_max_len, dtype="float32" ), # pileup at each interval, in float32 format
                                 np.zeros( chrom_max_len, dtype="float32" ), # control at each interval, in float32 format
                                 np.zeros( chrom_max_len, dtype="float32" ) ] # score at each interval, in float32 format
            self.datalength[chrom] = 0

    cpdef add (self, bytes chromosome, int32_t endpos, float32_t chip, float32_t control):
        """Add a chr-endpos-sample-control block into data
        dictionary.

        chromosome: chromosome name in string
        endpos    : end position of each interval in integer
        chip      : ChIP pileup value of each interval in float
        control   : Control pileup value of each interval in float

        *Warning* Need to add regions continuously.
        """
        cdef int32_t i
        i = self.datalength[chromosome]
        c = self.data[chromosome]
        c[0][ i ] = endpos
        c[1][ i ] = chip
        c[2][ i ] = control
        self.datalength[chromosome] += 1

    cpdef finalize ( self ):
        """
        Adjust array size of each chromosome.
        """
        cdef:
            bytes chrom, k
            int32_t l

        for chrom in sorted(self.data.keys()):
            d = self.data[chrom]
            l = self.datalength[chrom]
            d[0].resize( l, refcheck = False )
            d[1].resize( l, refcheck = False )
            d[2].resize( l, refcheck = False )
            d[3].resize( l, refcheck = False )
        return

    cpdef get_data_by_chr (self, bytes chromosome):
        """Return array of counts by chromosome.

        The return value is a tuple:
        ([end pos],[value])
        """
        if chromosome in self.data:
            return self.data[chromosome]
        else:
            return None

    cpdef get_chr_names (self):
        """Return all the chromosome names stored.
        """
        l = set(self.data.keys())
        return l

    cpdef change_normalization_method ( self, char normalization_method ):
        """Change/set the normalization method. However, I do not
        recommend changing this back and forth, since some precision
        issues will happen -- I only keep two digits.

        normalization_method: T: scale to depth of treatment;
                              C: scale to depth of control;
                              M: scale to depth of 1 million;
                              N: not set/ raw pileup
        """
        if normalization_method == ord('T'):
            if self.normalization_method == ord('T'): # do nothing
                pass
            elif self.normalization_method == ord('C'):
                self.normalize( self.treat_edm/self.ctrl_edm, self.treat_edm/self.ctrl_edm )
            elif self.normalization_method == ord('M'):
                self.normalize( self.treat_edm, self.treat_edm )
            elif self.normalization_method == ord('N'):
                self.normalize( 1, self.treat_edm/self.ctrl_edm )
            else:
                raise NotImplementedError
            self.normalization_method = ord('T')
        elif normalization_method == ord('C'):
            if self.normalization_method == ord('T'):
                self.normalize( self.ctrl_edm/self.treat_edm, self.ctrl_edm/self.treat_edm )
            elif self.normalization_method == ord('C'): # do nothing
                pass
            elif self.normalization_method == ord('M'):
                self.normalize( self.ctrl_edm, self.ctrl_edm )
            elif self.normalization_method == ord('N'):
                self.normalize( self.ctrl_edm/self.treat_edm, 1 )
            else:
                raise NotImplementedError
            self.normalization_method = ord('C')
        elif normalization_method == ord('M'):
            if self.normalization_method == ord('T'):
                self.normalize( 1/self.treat_edm, 1/self.treat_edm )
            elif self.normalization_method == ord('C'):
                self.normalize( 1/self.ctrl_edm, 1/self.ctrl_edm )
            elif self.normalization_method == ord('M'): # do nothing
                pass
            elif self.normalization_method == ord('N'):
                self.normalize( 1/self.treat_edm, 1/self.ctrl_edm )
            else:
                raise NotImplementedError
            self.normalization_method = ord('M')
        elif normalization_method == ord('N'):
            if self.normalization_method == ord('T'):
                self.normalize( self.treat_edm, self.treat_edm )
            elif self.normalization_method == ord('C'):
                self.normalize( self.ctrl_edm, self.ctrl_edm )
            elif self.normalization_method == ord('M'):
                self.normalize( self.treat_edm, self.ctrl_edm )
            elif self.normalization_method == ord('N'): # do nothing
                pass
            else:
                raise NotImplementedError
            self.normalization_method = ord('N')
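
    # Example of the conversion math (illustrative; made-up depths): with
    # treat_edm=20 and ctrl_edm=10, going from raw 'N' to 'T' multiplies the
    # control column by 20/10 = 2 and leaves treatment untouched; a later
    # switch 'T' -> 'M' rescales both columns by 1/20, matching a direct
    # 'N' -> 'M' normalization up to float rounding.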

    cdef normalize ( self, float32_t treat_scale, float32_t control_scale ):
        cdef:
            np.ndarray p, c
            int64_t l, i

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            l = self.datalength[chrom]
            for i in range(l):
                p[ i ] *= treat_scale
                c[ i ] *= control_scale
        return

    cpdef change_score_method (self, char scoring_method):
        """
        scoring_method:  p: -log10 pvalue;
                         q: -log10 qvalue;
                         l: log10 likelihood ratio ( minus for depletion )
                         s: symmetric log10 likelihood ratio ( for comparing two ChIPs )
                         f: log10 fold enrichment
                         F: linear fold enrichment
                         d: subtraction
                         M: maximum
                         m: fragment pileup per million reads
        """
        if scoring_method == ord('p'):
            self.compute_pvalue()
        elif scoring_method == ord('q'):
            # if the p-value hasn't been computed yet, compute it first
            if self.scoring_method != ord('p'):
                self.compute_pvalue()
            self.compute_qvalue()
        elif scoring_method == ord('l'):
            self.compute_likelihood()
        elif scoring_method == ord('s'):
            self.compute_sym_likelihood()
        elif scoring_method == ord('f'):
            self.compute_logFE()
        elif scoring_method == ord('F'):
            self.compute_foldenrichment()
        elif scoring_method == ord('d'):
            self.compute_subtraction()
        elif scoring_method == ord('m'):
            self.compute_SPMR()
        elif scoring_method == ord('M'):
            self.compute_max()
        else:
            raise NotImplementedError

    cdef compute_pvalue ( self ):
        """Compute -log_{10}(pvalue).
        """
        cdef:
            np.ndarray[np.float32_t] p, c, v
            np.ndarray[np.int32_t] pos
            int64_t l, i, prev_pos
            bytes chrom

        for chrom in sorted(self.data.keys()):
            prev_pos = 0
            pos = self.data[chrom][0]
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = get_pscore( <int32_t>(p[ i ] + self.pseudocount) , c[ i ] + self.pseudocount )
                try:
                    self.pvalue_stat[v[ i ]] += pos[ i ] - prev_pos
                except KeyError:
                    self.pvalue_stat[v[ i ]] = pos[ i ] - prev_pos
                prev_pos = pos[ i ]

        self.scoring_method = ord('p')
        return

    cdef compute_qvalue ( self ):
        """Compute -log_{10}(qvalue).
        """
        cdef:
            object pqtable
            int64_t i, l, j
            bytes chrom
            np.ndarray p, c, v

        # pvalue should be computed first!
        assert self.scoring_method == ord('p')
        # make pqtable
        pqtable = self.make_pq_table()

        # convert p to q
        for chrom in sorted(self.data.keys()):
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = pqtable[ v[ i ] ]

        self.scoring_method = ord('q')
        return

    cpdef object make_pq_table ( self ):
        """Make a pvalue-qvalue table.

        Step 1: get all pvalues and the total basepairs of the blocks with each pvalue
        Step 2: sort them
        Step 3: apply the AFDR method to adjust pvalues and get a qvalue for each pvalue

        Return a dictionary of {-log10pvalue: -log10qvalue} relationships.
        """
        cdef:
            int64_t n, pre_p, this_p, length, pre_l, l, i, j
            float32_t this_v, pre_v, v, q, pre_q # store the p and q scores
            int64_t N, k
            float32_t f
            bytes chrom
            np.ndarray v_chrom, pos_chrom
            object pvalue2qvalue
            dict pvalue_stat
            list unique_values

        assert self.scoring_method == ord('p')

        pvalue_stat = self.pvalue_stat

        N = sum(pvalue_stat.values())
        k = 1                           # rank
        f = -log10(N)
        pre_v = -2147483647
        pre_l = 0
        pre_q = 2147483647              # save the previous q-value

        pvalue2qvalue = Float32to32Map( for_int = False )
        unique_values = sorted(list(pvalue_stat.keys()), reverse=True)
        for i in range(len(unique_values)):
            v = unique_values[i]
            l = pvalue_stat[v]
            q = v + (log10(k) + f)      # i.e. q = p - log10(N/k) in -log10 space
            if q > pre_q:
                q = pre_q
            if q <= 0:
                q = 0
                break
            pvalue2qvalue[ v ] = q
            pre_q = q
            k += l
        # bottom-ranked pscores all have qscores of 0
        for j in range(i, len(unique_values) ):
            v = unique_values[ j ]
            pvalue2qvalue[ v ] = 0
        return pvalue2qvalue
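
    # Worked example (illustrative; toy numbers): suppose pvalue_stat holds
    # three -log10 p-scores over N = 1000 bp: {5.0: 100, 3.0: 400, 1.0: 500}.
    # Scanning from the largest score, with rank k counted in basepairs:
    #   v=5.0, k=1:   q = 5.0 + log10(1)   - log10(1000) = 2.0
    #   v=3.0, k=101: q = 3.0 + log10(101) - log10(1000) ~ 2.0 (capped at pre_q)
    #   v=1.0, k=501: q = 1.0 + log10(501) - log10(1000) ~ 0.7
    # i.e. a Benjamini-Hochberg-style adjustment per basepair, kept monotone.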

    cdef compute_likelihood ( self ):
        """Calculate log10 likelihood ratio.
        """
        cdef:
            #np.ndarray v, p, c
            int64_t l, i
            bytes chrom
            float32_t v1, v2
            float32_t pseudocount

        pseudocount = self.pseudocount

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][ 1 ].flat.__next__ # pileup in treatment
            c = self.data[chrom][ 2 ].flat.__next__ # pileup in control
            v = self.data[chrom][ 3 ]               # score
            l = self.datalength[chrom]
            v1 = 2
            v2 = 1
            for i in range(l):
                v1 = p()
                v2 = c()
                v[ i ] = logLR_asym( v1 + pseudocount, v2 + pseudocount )
        self.scoring_method = ord('l')
        return

    cdef compute_sym_likelihood ( self ):
        """Calculate symmetric log10 likelihood ratio.
        """
        cdef:
            #np.ndarray v, p, c
            int64_t l, i
            bytes chrom
            float32_t v1, v2
            float32_t pseudocount

        pseudocount = self.pseudocount

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][ 1 ].flat.__next__
            c = self.data[chrom][ 2 ].flat.__next__
            v = self.data[chrom][ 3 ]
            l = self.datalength[chrom]
            v1 = 2
            v2 = 1
            for i in range(l):
                v1 = p()
                v2 = c()
                v[ i ] = logLR_sym( v1 + pseudocount, v2 + pseudocount )
        self.scoring_method = ord('s')
        return

    cdef compute_logFE ( self ):
        """Calculate log10 fold enrichment ( with the pseudocount ).
        """
        cdef:
            np.ndarray p, c, v
            int64_t l, i
            float32_t pseudocount

        pseudocount = self.pseudocount

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = get_logFE ( p[ i ] + pseudocount, c[ i ] + pseudocount)
        self.scoring_method = ord('f')
        return

    cdef compute_foldenrichment ( self ):
        """Calculate linear scale fold enrichment ( with the pseudocount ).
        """
        cdef:
            np.ndarray p, c, v
            int64_t l, i
            float32_t pseudocount

        pseudocount = self.pseudocount

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = ( p[ i ] + pseudocount )/( c[ i ] + pseudocount )
        self.scoring_method = ord('F')
        return

    cdef compute_subtraction ( self ):
        """Calculate the treatment minus control subtraction score.
        """
        cdef:
            np.ndarray p, c, v
            int64_t l, i

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = p[ i ] - c[ i ]
        self.scoring_method = ord('d')
        return

    cdef compute_SPMR ( self ):
        """Calculate signal per million reads (SPMR) from the treatment
        pileup; the scale depends on the current normalization method.
        """
        cdef:
            np.ndarray p, v
            int64_t l, i
            float32_t scale
        if self.normalization_method == ord('T') or self.normalization_method == ord('N'):
            scale = self.treat_edm
        elif self.normalization_method == ord('C'):
            scale = self.ctrl_edm
        elif self.normalization_method == ord('M'):
            scale = 1

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = p[ i ] / scale # two digit precision may not be enough...
        self.scoring_method = ord('m')
        return

    cdef compute_max ( self ):
        """Use the maximum of treatment and control pileup as the score.
        """
        cdef:
            np.ndarray p, c, v
            int64_t l, i

        for chrom in sorted(self.data.keys()):
            p = self.data[chrom][1]
            c = self.data[chrom][2]
            v = self.data[chrom][3]
            l = self.datalength[chrom]
            for i in range(l):
                v[ i ] = max(p[ i ], c[ i ])
        self.scoring_method = ord('M')
        return

    cpdef write_bedGraph ( self, fhd, str name, str description, short column = 3):
        """Write all data to fhd in bedGraph format.

        fhd: a filehandler to save bedGraph.

        name/description: the name and description in the track line.

        column: can be 1: chip, 2: control, 3: score
        """
        cdef:
            bytes chrom
            int32_t l, pre, i, p
            float32_t pre_v, v
            set chrs
            np.ndarray pos, value

        assert column in range( 1, 4 ), "column should be 1, 2 or 3."

        write = fhd.write

        if self.trackline:
            # this line is REQUIRED by the wiggle format for UCSC browser
            write( "track type=bedGraph name=\"%s\" description=\"%s\"\n" % ( name, description ) )

        chrs = self.get_chr_names()
        for chrom in sorted(chrs):
            pos = self.data[ chrom ][ 0 ]
            value = self.data[ chrom ][ column ]
            l = self.datalength[ chrom ]
            pre = 0
            if pos.shape[ 0 ] == 0: continue # skip if there's no data
            pre_v = value[ 0 ]
            for i in range( 1, l ):
                v = value[ i ]
                p = pos[ i-1 ]
                #if ('%.5f' % pre_v) != ('%.5f' % v):
                if abs(pre_v - v) > 1e-5: # precision is 5 digits
                    write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) )
                    pre_v = v
                    pre = p
            p = pos[ -1 ]
            # last one
            write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) )

        return True
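
    # Example output (illustrative): with trackline enabled, the file starts
    # with a track line followed by one row per run of near-identical scores:
    #
    #   track type=bedGraph name="treat" description="treatment scores"
    #   chr1    0       100     4.76821
    #   chr1    100     200     0.30103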

    cpdef call_peaks (self, float32_t cutoff=5.0, int32_t min_length=200, int32_t max_gap=50, bool call_summits=False):
        """This function tries to find regions within which scores
        are continuously higher than a given cutoff.

        This function is NOT using sliding-windows. Instead, any
        region in the bedGraph above a certain cutoff will be detected,
        then merged with a nearby region if the gap between the two is
        below max_gap. After this, a peak is reported if its length is
        above min_length.

        cutoff:  cutoff of value, default 5. For -log10pvalue, it means 10^-5.
        min_length :  minimum peak length, default 200.
        max_gap   :  maximum gap to merge nearby peaks, default 50.
        call_summits: whether to call sub-peak summits.
        """
        cdef:
            int32_t i
            bytes chrom
            np.ndarray pos, sample, control, value, above_cutoff, above_cutoff_v, above_cutoff_endpos, above_cutoff_startpos, above_cutoff_sv
            list peak_content

        chrs = self.get_chr_names()
        peaks = PeakIO()                # dictionary to save peaks

        self.cutoff = cutoff
        for chrom in sorted(chrs):
            peak_content = []           # to store points above cutoff

            pos = self.data[chrom][ 0 ]
            sample = self.data[chrom][ 1 ]
            control = self.data[chrom][ 2 ]
            value = self.data[chrom][ 3 ]

            above_cutoff = np.nonzero( value >= cutoff )[0] # indices where score is above cutoff
            above_cutoff_v = value[above_cutoff] # scores where score is above cutoff

            above_cutoff_endpos = pos[above_cutoff] # end positions of regions where score is above cutoff
            above_cutoff_startpos = pos[above_cutoff-1] # start positions of regions where score is above cutoff
            above_cutoff_sv = sample[above_cutoff] # sample pileup height where score is above cutoff

            if above_cutoff_v.size == 0:
                # nothing above cutoff
                continue

            if above_cutoff[0] == 0:
                # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
                above_cutoff_startpos[0] = 0

            # first bit of region above cutoff
            peak_content.append( (above_cutoff_startpos[0], above_cutoff_endpos[0], above_cutoff_v[0], above_cutoff_sv[0], above_cutoff[0]) )
            for i in range( 1, above_cutoff_startpos.size ):
                if above_cutoff_startpos[i] - peak_content[-1][1] <= max_gap:
                    # append
                    peak_content.append( (above_cutoff_startpos[i], above_cutoff_endpos[i], above_cutoff_v[i], above_cutoff_sv[i], above_cutoff[i]) )
                else:
                    # close
                    if call_summits:
                        self.__close_peak2(peak_content, peaks, min_length, chrom, max_gap//2 )
                    else:
                        self.__close_peak(peak_content, peaks, min_length, chrom )
                    peak_content = [(above_cutoff_startpos[i], above_cutoff_endpos[i], above_cutoff_v[i], above_cutoff_sv[i], above_cutoff[i]),]

            # save the last peak
            if not peak_content:
                continue
            else:
                if call_summits:
                    self.__close_peak2(peak_content, peaks, min_length, chrom, max_gap//2 )
                else:
                    self.__close_peak(peak_content, peaks, min_length, chrom )

        return peaks
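
    # Merge behaviour in brief (illustrative): with max_gap=50, above-cutoff
    # blocks [100, 300) and [340, 600) are joined into one candidate peak
    # (gap 40 <= 50), while a following block starting at 700 (gap 100) closes
    # the current peak and starts a new one.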

    cdef bool __close_peak (self, list peak_content, object peaks, int32_t min_length,
                            bytes chrom):
        """Close the peak region, output peak boundaries, peak summit
        and scores, then add the peak to the PeakIO object.

        In this function, we define the peak summit as the middle
        point of the region with the highest score in this peak. For
        example, if the region of the highest score is from 100 to
        200, the summit is 150. If there are several regions with the
        same 'highest score', we will first calculate the possible
        summit for each such region, then pick a position close to the
        middle index ( = (len(highest_regions) + 1) / 2 ) of these
        summits. For example, if there are three regions with the same
        highest scores, [100,200], [300,400], [600,700], we will first
        find the possible summits as 150, 350, and 650, and then pick
        the middle index, the 2nd, of the three positions -- 350 as
        the final summit. If there are four regions, we pick the 2nd
        as well.

        peaks: a PeakIO object
        """
        cdef:
            int32_t summit_pos, tstart, tend, tmpindex, summit_index, i, midindex
            float32_t summit_value, tvalue, tsummitvalue

        peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ]
        if peak_length >= min_length:   # reject the peak if it is too small
            tsummit = []
            summit_pos = 0
            summit_value = 0
            for i in range(len(peak_content)):
                (tstart, tend, tvalue, tsummitvalue, tindex) = peak_content[i]
                if not summit_value or summit_value < tsummitvalue:
                    tsummit = [(tend + tstart) / 2, ]
                    tsummit_index = [ tindex, ]
                    summit_value = tsummitvalue
                elif summit_value == tsummitvalue:
                    # remember continuous summit values
                    tsummit.append(int((tend + tstart) / 2))
                    tsummit_index.append( tindex )
            # the middle of all highest points in the peak region is defined as the summit
            midindex = int((len(tsummit) + 1) / 2) - 1
            summit_pos = tsummit[ midindex ]
            summit_index = tsummit_index[ midindex ]
            if self.scoring_method == ord('q'):
                qscore = self.data[chrom][3][ summit_index ]
            else:
                # if the q-value is not computed, use -1
                qscore = -1

            peaks.add( chrom,
                       peak_content[0][0],
                       peak_content[-1][1],
                       summit = summit_pos,
                       peak_score = self.data[chrom][ 3 ][ summit_index ],
                       pileup = self.data[chrom][ 1 ][ summit_index ], # should be the same as summit_value
                       pscore = get_pscore(self.data[chrom][ 1 ][ summit_index ], self.data[chrom][ 2 ][ summit_index ]),
                       fold_change = ( self.data[chrom][ 1 ][ summit_index ] + self.pseudocount ) / ( self.data[chrom][ 2 ][ summit_index ] + self.pseudocount ),
                       qscore = qscore,
                       )
            # start a new peak
            return True

    cdef bool __close_peak2 (self, list peak_content, object peaks, int32_t min_length,
                             bytes chrom, int32_t smoothlen=51,
                             float32_t min_valley = 0.9):
        """Close the peak region, output peak boundaries, peak summit
        and scores, then add the peak to the PeakIO object.

        In this function, we use signal processing methods to smooth
        the scores in the peak region, find the maxima and enforce the
        peaky shape, and define the best maxima as the peak
        summit. The functions used for signal processing are 'maxima'
        (with 2nd order polynomial filter) and 'enforce_peakyness'
        in SignalProcessing.pyx.

        peaks: a PeakIO object

        """
        cdef:
            int32_t summit_pos, tstart, tend, tmpindex, summit_index, summit_offset
            int32_t start, end, i, j, start_boundary
            float32_t summit_value, tvalue, tsummitvalue
            # np.ndarray[np.float32_t, ndim=1] w
            np.ndarray[np.float32_t, ndim=1] peakdata
            np.ndarray[np.int32_t, ndim=1] peakindices, summit_offsets

        # Add 10 bp padding to the peak region so that we can get true minima
        end = peak_content[ -1 ][ 1 ] + 10
        start = peak_content[ 0 ][ 0 ] - 10
        if start < 0:
            start_boundary = 10 + start
            start = 0
        else:
            start_boundary = 10
        peak_length = end - start
        if end - start < min_length: return # if the region is too small, reject it

        peakdata = np.zeros(end - start, dtype='float32')
        peakindices = np.zeros(end - start, dtype='int32')
        for (tstart, tend, tvalue, tsvalue, tmpindex) in peak_content:
            i = tstart - start + start_boundary
            j = tend - start + start_boundary
            peakdata[i:j] = tsvalue
            peakindices[i:j] = tmpindex
        summit_offsets = maxima(peakdata, smoothlen)
        if summit_offsets.shape[0] == 0:
            # **failsafe** if no summits, fall back on old approach #
            return self.__close_peak(peak_content, peaks, min_length, chrom)
        else:
            # remove maxima that occurred in padding
            i = np.searchsorted(summit_offsets, start_boundary)
            j = np.searchsorted(summit_offsets, peak_length + start_boundary, 'right')
            summit_offsets = summit_offsets[i:j]

        summit_offsets = enforce_peakyness(peakdata, summit_offsets)
        if summit_offsets.shape[0] == 0:
            # **failsafe** if no summits, fall back on old approach #
            return self.__close_peak(peak_content, peaks, min_length, chrom)

        summit_indices = peakindices[summit_offsets]
        summit_offsets -= start_boundary

        peak_scores = self.data[chrom][3][ summit_indices ]
        if not (peak_scores > self.cutoff).all():
            return self.__close_peak(peak_content, peaks, min_length, chrom)
        for summit_offset, summit_index in zip(summit_offsets, summit_indices):
            if self.scoring_method == ord('q'):
                qscore = self.data[chrom][3][ summit_index ]
            else:
                # if the q-value is not computed, use -1
                qscore = -1
            peaks.add( chrom,
                       start,
                       end,
                       summit = start + summit_offset,
                       peak_score = self.data[chrom][3][ summit_index ],
                       pileup = self.data[chrom][1][ summit_index ], # should be the same as summit_value
                       pscore = get_pscore(self.data[chrom][ 1 ][ summit_index ], self.data[chrom][ 2 ][ summit_index ]),
                       fold_change = ( self.data[chrom][ 1 ][ summit_index ] + self.pseudocount ) / ( self.data[chrom][ 2 ][ summit_index ] + self.pseudocount ),
                       qscore = qscore,
                       )
        # start a new peak
        return True
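
    # Flow sketch (illustrative): each (start, end, score, sample_value, index)
    # block paints its sample pileup into the padded peakdata array; maxima()
    # proposes smoothed summit offsets, offsets inside the padding are trimmed
    # with np.searchsorted, enforce_peakyness() filters weak candidates, and
    # only if every surviving summit clears self.cutoff is one PeakIO entry
    # added per summit -- otherwise the region falls back to __close_peak().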

    cdef int64_t total ( self ):
        """Return the number of regions in this object.
        """
        cdef:
            int64_t t
            bytes chrom

        t = 0
        for chrom in sorted(self.data.keys()):
            t += self.datalength[chrom]
        return t

    cpdef call_broadpeaks (self, float32_t lvl1_cutoff=5.0, float32_t lvl2_cutoff=1.0, int32_t min_length=200, int32_t lvl1_max_gap=50, int32_t lvl2_max_gap=400):
        """This function tries to find enriched regions within which
        scores are continuously higher than a given cutoff for level
        1, and links them using the gap above the level 2 cutoff with a
        maximum length of lvl2_max_gap.

        lvl1_cutoff:  cutoff of value at enriched regions, default 5.0.
        lvl2_cutoff:  cutoff of value at linkage regions, default 1.0.
        min_length :  minimum peak length, default 200.
        lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50.
        lvl2_max_gap : maximum length of linkage regions, default 400.

        Return both the general PeakIO object for highly enriched regions
        and the gapped broad regions in BroadPeakIO.
        """
        cdef:
            int32_t i
            bytes chrom

        assert lvl1_cutoff > lvl2_cutoff, "level 1 cutoff should be larger than level 2."
        assert lvl1_max_gap < lvl2_max_gap, "level 2 maximum gap should be larger than level 1."
        lvl1_peaks = self.call_peaks(cutoff=lvl1_cutoff, min_length=min_length, max_gap=lvl1_max_gap)
        lvl2_peaks = self.call_peaks(cutoff=lvl2_cutoff, min_length=min_length, max_gap=lvl2_max_gap)
        chrs = lvl1_peaks.peaks.keys()
        broadpeaks = BroadPeakIO()
        # use lvl2_peaks as linking regions between lvl1_peaks
        for chrom in sorted(chrs):
            lvl1peakschrom = lvl1_peaks.peaks[chrom]
            lvl2peakschrom = lvl2_peaks.peaks[chrom]
            lvl1peakschrom_next = iter(lvl1peakschrom).__next__
            tmppeakset = []             # to temporarily store lvl1 regions inside a lvl2 region
            # our assumption is that lvl1 regions should be included in lvl2 regions
            try:
                lvl1 = lvl1peakschrom_next()
                for i in range( len(lvl2peakschrom) ):
                    # for each lvl2 peak, find all lvl1 peaks inside
                    # I assume lvl1 peaks can be ALL covered by lvl2 peaks.
                    lvl2 = lvl2peakschrom[i]

                    while True:
                        if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]:
                            tmppeakset.append(lvl1)
                            lvl1 = lvl1peakschrom_next()
                        else:
                            # make a hierarchical broad peak
                            self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset)
                            tmppeakset = []
                            break
            except StopIteration:
                # no more strong (aka lvl1) peaks left
                self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset)
                tmppeakset = []
                # add the rest of the lvl2 peaks
                for j in range( i+1, len(lvl2peakschrom) ):
                    self.__add_broadpeak( broadpeaks, chrom, lvl2peakschrom[j], tmppeakset )

        return broadpeaks

    def __add_broadpeak (self, bpeaks, bytes chrom, dict lvl2peak, list lvl1peakset):
        """Internal function to create a broad peak.
        """
        cdef:
            int32_t blockNum, start, end
            bytes blockSizes, blockStarts, thickStart, thickEnd

        start = lvl2peak["start"]
        end = lvl2peak["end"]

        # the following code will add those broad/lvl2 peaks with no strong/lvl1 peaks inside
        if not lvl1peakset:
            # will complement by adding 1bp start and end marks to this region
            # may change in the future if the gappedPeak format is improved.
            bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=(b"%d" % start), thickEnd=(b"%d" % end),
                       blockNum = 2, blockSizes = b"1,1", blockStarts = (b"0,%d" % (end-start-1)), pileup = lvl2peak["pileup"],
                       pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"],
                       qscore = lvl2peak["qscore"] )
            return bpeaks

        thickStart = b"%d" % lvl1peakset[0]["start"]
        thickEnd = b"%d" % lvl1peakset[-1]["end"]
        blockNum = int(len(lvl1peakset))
        blockSizes = b",".join( [b"%d" % x["length"] for x in lvl1peakset] )
        blockStarts = b",".join( [b"%d" % (x["start"]-start) for x in lvl1peakset] )

        if lvl2peak["start"] != thickStart:
            # add a 1bp mark for the start of the lvl2 peak
            thickStart = b"%d" % start
            blockNum += 1
            blockSizes = b"1,"+blockSizes
            blockStarts = b"0,"+blockStarts
        if lvl2peak["end"] != thickEnd:
            # add a 1bp mark for the end of the lvl2 peak
            thickEnd = b"%d" % end
            blockNum += 1
            blockSizes = blockSizes+b",1"
            blockStarts = blockStarts + b"," + (b"%d" % (end-start-1))

        # add to BroadPeakIO object
        bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=thickStart, thickEnd=thickEnd,
                   blockNum = blockNum, blockSizes = blockSizes, blockStarts = blockStarts, pileup = lvl2peak["pileup"],
                   pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"],
                   qscore = lvl2peak["qscore"] )
        return bpeaks
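
    # Example (illustrative): a lvl2 peak [1000, 2000) holding lvl1 peaks
    # [1100, 1300) and [1600, 1900) starts from blockSizes=b"200,300" and
    # blockStarts=b"100,600"; because lvl2peak["start"] (an int) never equals
    # thickStart (bytes), the two 1 bp edge marks are always added, giving
    # blockNum=4, blockSizes=b"1,200,300,1", blockStarts=b"0,100,600,999".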

cdef class TwoConditionScores:
    """Class for saving two-condition comparison scores.
    """
    cdef:
        dict data                      # dictionary for data of each chromosome
        dict datalength                # length of data array of each chromosome
        float32_t cond1_factor         # factor to apply to cond1 pileup values
        float32_t cond2_factor         # factor to apply to cond2 pileup values
        float32_t pseudocount          # the pseudocount used to calculate LLR
        float32_t cutoff
        object t1bdg, c1bdg, t2bdg, c2bdg
        dict pvalue_stat1, pvalue_stat2, pvalue_stat3

    def __init__ (self, t1bdg, c1bdg, t2bdg, c2bdg, float32_t cond1_factor = 1.0, float32_t cond2_factor = 1.0, float32_t pseudocount = 0.01, float32_t proportion_background_empirical_distribution = 0.99999 ):
        """
        t1bdg: a bedGraphTrackI object for treat 1
        c1bdg: a bedGraphTrackI object for control 1
        t2bdg: a bedGraphTrackI object for treat 2
        c2bdg: a bedGraphTrackI object for control 2

        cond1_factor: this will be multiplied to values in t1bdg and c1bdg
        cond2_factor: this will be multiplied to values in t2bdg and c2bdg

        pseudocount: pseudocount, by default 0.01.

        proportion_background_empirical_distribution: proportion of genome as the background to build empirical distribution
        """
        self.data = {}                  # for each chromosome, there is a l*4
                                        # matrix. First column: end position
                                        # of a region; second: LLR of treat 1
                                        # vs control 1; third: LLR of treat 2
                                        # vs control 2; fourth: LLR of treat 1
                                        # vs treat 2
        self.datalength = {}
        self.cond1_factor = cond1_factor
        self.cond2_factor = cond2_factor
        self.pseudocount = pseudocount
        self.pvalue_stat1 = {}
        self.pvalue_stat2 = {}
        self.t1bdg = t1bdg
        self.c1bdg = c1bdg
        self.t2bdg = t2bdg
        self.c2bdg = c2bdg

        #self.empirical_distr_llr = [] # save all values in histogram
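
    # Usage sketch (illustrative; t1/c1/t2/c2 stand for bedGraphTrackI-like
    # objects providing get_chr_names() and get_data_by_chr()):
    #
    #   tcs = TwoConditionScores(t1, c1, t2, c2, cond1_factor=1.0, cond2_factor=0.8)
    #   tcs.build()
    #   tcs.finalize()
    #   cat1, cat2, common = tcs.call_peaks(cutoff=3, min_length=200, max_gap=100)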

    cpdef set_pseudocount( self, float32_t pseudocount ):
        self.pseudocount = pseudocount

    cpdef build ( self ):
        """Compute scores from 3 types of comparisons and store them in self.data.
        """
        cdef:
            set common_chrs
            bytes chrname
            int32_t chrom_max_len
        # common chromosome names
        common_chrs = self.get_common_chrs()
        for chrname in common_chrs:
            (cond1_treat_ps, cond1_treat_vs) = self.t1bdg.get_data_by_chr(chrname)
            (cond1_control_ps, cond1_control_vs) = self.c1bdg.get_data_by_chr(chrname)
            (cond2_treat_ps, cond2_treat_vs) = self.t2bdg.get_data_by_chr(chrname)
            (cond2_control_ps, cond2_control_vs) = self.c2bdg.get_data_by_chr(chrname)
            chrom_max_len = len(cond1_treat_ps) + len(cond1_control_ps) +\
                            len(cond2_treat_ps) + len(cond2_control_ps)
            self.add_chromosome( chrname, chrom_max_len )
            self.build_chromosome( chrname,
                                   cond1_treat_ps, cond1_control_ps,
                                   cond2_treat_ps, cond2_control_ps,
                                   cond1_treat_vs, cond1_control_vs,
                                   cond2_treat_vs, cond2_control_vs )

    cdef build_chromosome( self, chrname,
                           cond1_treat_ps, cond1_control_ps,
                           cond2_treat_ps, cond2_control_ps,
                           cond1_treat_vs, cond1_control_vs,
                           cond2_treat_vs, cond2_control_vs ):
        """Internal function to calculate scores for three types of comparisons.

        cond1_treat_ps, cond1_control_ps: positions of treat and control of condition 1
        cond2_treat_ps, cond2_control_ps: positions of treat and control of condition 2
        cond1_treat_vs, cond1_control_vs: values of treat and control of condition 1
        cond2_treat_vs, cond2_control_vs: values of treat and control of condition 2
        """
        cdef:
            int32_t c1tp, c1cp, c2tp, c2cp, minp, pre_p
            float32_t c1tv, c1cv, c2tv, c2cv
        c1tpn = iter(cond1_treat_ps).__next__
        c1cpn = iter(cond1_control_ps).__next__
        c2tpn = iter(cond2_treat_ps).__next__
        c2cpn = iter(cond2_control_ps).__next__
        c1tvn = iter(cond1_treat_vs).__next__
        c1cvn = iter(cond1_control_vs).__next__
        c2tvn = iter(cond2_treat_vs).__next__
        c2cvn = iter(cond2_control_vs).__next__

        pre_p = 0

        try:
            c1tp = c1tpn()
            c1tv = c1tvn()

            c1cp = c1cpn()
            c1cv = c1cvn()

            c2tp = c2tpn()
            c2tv = c2tvn()

            c2cp = c2cpn()
            c2cv = c2cvn()

            while True:
                minp = min(c1tp, c1cp, c2tp, c2cp)
                # the current values hold up to the smallest pending breakpoint
                self.add( chrname, minp, c1tv, c1cv, c2tv, c2cv )
                pre_p = minp
                if c1tp == minp:
                    c1tp = c1tpn()
                    c1tv = c1tvn()
                if c1cp == minp:
                    c1cp = c1cpn()
                    c1cv = c1cvn()
                if c2tp == minp:
                    c2tp = c2tpn()
                    c2tv = c2tvn()
                if c2cp == minp:
                    c2cp = c2cpn()
                    c2cv = c2cvn()
        except StopIteration:
            # meet the end of either bedGraphTrackI, simply exit
            pass
        return
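
    # Worked sketch (illustrative): if the four position arrays are
    #   t1: [100, 250], c1: [250], t2: [150, 250], c2: [250]
    # the loop emits intervals ending at 100, 150 and 250, each scored with
    # the values in force over that interval; only the iterators whose
    # breakpoint equals the current minimum are advanced.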

    cdef set get_common_chrs ( self ):
        cdef:
            set t1chrs, c1chrs, t2chrs, c2chrs, common
        t1chrs = self.t1bdg.get_chr_names()
        c1chrs = self.c1bdg.get_chr_names()
        t2chrs = self.t2bdg.get_chr_names()
        c2chrs = self.c2bdg.get_chr_names()
        common = reduce(lambda x, y: x.intersection(y), (t1chrs, c1chrs, t2chrs, c2chrs))
        return common

    cdef add_chromosome ( self, bytes chrom, int32_t chrom_max_len ):
        """
        chrom: chromosome name
        chrom_max_len: maximum number of data points in this chromosome
        """
        if chrom not in self.data:
            self.data[chrom] = [ np.zeros( chrom_max_len, dtype="int32" ),   # pos
                                 np.zeros( chrom_max_len, dtype="float32" ), # LLR t1 vs c1
                                 np.zeros( chrom_max_len, dtype="float32" ), # LLR t2 vs c2
                                 np.zeros( chrom_max_len, dtype="float32" )] # LLR t1 vs t2
            self.datalength[chrom] = 0

    cdef add (self, bytes chromosome, int32_t endpos, float32_t t1, float32_t c1, float32_t t2, float32_t c2):
        """Take chr-endpos-sample1-control1-sample2-control2 and
        compute logLR for t1 vs c1, t2 vs c2, and t1 vs t2, then save
        the values.

        chromosome: chromosome name in string
        endpos    : end position of each interval in integer
        t1        : Sample 1 ChIP pileup value of each interval in float
        c1        : Sample 1 Control pileup value of each interval in float
        t2        : Sample 2 ChIP pileup value of each interval in float
        c2        : Sample 2 Control pileup value of each interval in float

        *Warning* Need to add regions continuously.
        """
        cdef:
            int32_t i
            list c
        i = self.datalength[chromosome]
        c = self.data[chromosome]
        c[0][ i ] = endpos
        c[1][ i ] = logLR_asym( (t1+self.pseudocount)*self.cond1_factor, (c1+self.pseudocount)*self.cond1_factor )
        c[2][ i ] = logLR_asym( (t2+self.pseudocount)*self.cond2_factor, (c2+self.pseudocount)*self.cond2_factor )
        c[3][ i ] = logLR_sym( (t1+self.pseudocount)*self.cond1_factor, (t2+self.pseudocount)*self.cond2_factor )
        self.datalength[chromosome] += 1
        return
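
    # Example (illustrative): with pseudocount=0.01 and both factors 1.0, a
    # call like add(b"chr1", 100, t1=10, c1=2, t2=3, c2=2) stores
    #   c[1][i] = logLR_asym(10.01, 2.01)   # cond1 treat vs cond1 control
    #   c[2][i] = logLR_asym(3.01, 2.01)    # cond2 treat vs cond2 control
    #   c[3][i] = logLR_sym(10.01, 3.01)    # cond1 vs cond2; sign marks direction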

    cpdef finalize ( self ):
        """
        Adjust array size of each chromosome.
        """
        cdef:
            bytes chrom
            int32_t l
            list d

        for chrom in sorted(self.data.keys()):
            d = self.data[chrom]
            l = self.datalength[chrom]
            d[0].resize( l, refcheck = False )
            d[1].resize( l, refcheck = False )
            d[2].resize( l, refcheck = False )
            d[3].resize( l, refcheck = False )
        return

    cpdef get_data_by_chr (self, bytes chromosome):
        """Return array of counts by chromosome.

        The return value is a tuple:
        ([end pos],[value])
        """
        if chromosome in self.data:
            return self.data[chromosome]
        else:
            return None

    cpdef get_chr_names (self):
        """Return all the chromosome names stored.
        """
        l = set(self.data.keys())
        return l

    cpdef write_bedGraph ( self, fhd, str name, str description, int32_t column = 3):
        """Write all data to fhd in bedGraph format.

        fhd: a filehandler to save bedGraph.

        name/description: the name and description in the track line.

        column: can be 1: cond1 chip vs cond1 ctrl, 2: cond2 chip vs cond2 ctrl, 3: cond1 chip vs cond2 chip
        """
        cdef:
            bytes chrom
            int32_t l, pre, i, p
            float32_t pre_v, v
            np.ndarray pos, value

        assert column in range( 1, 4 ), "column should be 1, 2 or 3."

        write = fhd.write

        #if self.trackline:
        #    # this line is REQUIRED by the wiggle format for UCSC browser
        #    write( "track type=bedGraph name=\"%s\" description=\"%s\"\n" % ( name.decode(), description ) )

        chrs = self.get_chr_names()
        for chrom in sorted(chrs):
            pos = self.data[ chrom ][ 0 ]
            value = self.data[ chrom ][ column ]
            l = self.datalength[ chrom ]
            pre = 0
            if pos.shape[ 0 ] == 0: continue # skip if there's no data
            pre_v = value[ 0 ]
            for i in range( 1, l ):
                v = value[ i ]
                p = pos[ i-1 ]
                if abs(pre_v - v) >= 1e-6:
                    write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) )
                    pre_v = v
                    pre = p
            p = pos[ -1 ]
            # last one
            write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) )

        return True

    cpdef write_matrix ( self, fhd, str name, str description ):
        """Write all data to fhd in a four-column format:

        col1: chr_start_end
        col2: t1 vs c1
        col3: t2 vs c2
        col4: t1 vs t2

        fhd: a filehandler to save the matrix.
        """
        cdef:
            bytes chrom
            int32_t l, pre, i, p
            float32_t v1, v2, v3
            np.ndarray pos, value1, value2, value3

        write = fhd.write

        chrs = self.get_chr_names()
        for chrom in sorted(chrs):
            [ pos, value1, value2, value3 ] = self.data[ chrom ]
            l = self.datalength[ chrom ]
            pre = 0
            if pos.shape[ 0 ] == 0: continue # skip if there's no data
            for i in range( 0, l ):
                v1 = value1[ i ]
                v2 = value2[ i ]
                v3 = value3[ i ]
                p = pos[ i ]
                write( "%s:%d_%d\t%.5f\t%.5f\t%.5f\n" % ( chrom.decode(), pre, p, v1, v2, v3 ) )
                pre = p

        return True
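
    # Example output line (illustrative): the key is chrom:start_end, followed
    # by the three LLR columns:
    #
    #   chr1:0_100    0.83885    0.15201    0.66751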

    cpdef tuple call_peaks (self, float32_t cutoff=3, int32_t min_length=200, int32_t max_gap = 100,
                            bool call_summits=False):
        """This function tries to find regions within which scores
        are continuously higher than a given cutoff.

        For bdgdiff.

        This function is NOT using sliding-windows. Instead, any
        region in the bedGraph above a certain cutoff will be detected,
        then merged with a nearby region if the gap between the two is
        below max_gap. After this, a peak is reported if its length is
        above min_length.

        cutoff: cutoff of value, default 3. For log10 LR, it means an
            LR of 1000 (or 1/1000 in the other direction).
        min_length :  minimum peak length, default 200.
        max_gap   :  maximum gap to merge nearby peaks, default 100.
        """
        cdef:
            int32_t i
            bytes chrom
            np.ndarray pos, t1_vs_c1, t2_vs_c2, t1_vs_t2, \
                       cond1_over_cond2, cond2_over_cond1, cond1_equal_cond2, \
                       cond1_sig, cond2_sig, \
                       cat1, cat2, cat3, \
                       cat1_startpos, cat1_endpos, cat2_startpos, cat2_endpos, \
                       cat3_startpos, cat3_endpos
        chrs = self.get_chr_names()
        cat1_peaks = PeakIO()           # to save peaks significant in condition 1
        cat2_peaks = PeakIO()           # to save peaks significant in condition 2
        cat3_peaks = PeakIO()           # to save peaks significant in both conditions

        self.cutoff = cutoff

        for chrom in sorted(chrs):
            pos = self.data[chrom][ 0 ]
            t1_vs_c1 = self.data[chrom][ 1 ]
            t2_vs_c2 = self.data[chrom][ 2 ]
            t1_vs_t2 = self.data[chrom][ 3 ]
            and_ = np.logical_and
            cond1_over_cond2 = t1_vs_t2 >= cutoff    # regions with stronger cond1 signals
            cond2_over_cond1 = t1_vs_t2 <= -1*cutoff # regions with stronger cond2 signals
            cond1_equal_cond2 = and_( t1_vs_t2 >= -1*cutoff, t1_vs_t2 <= cutoff )
            cond1_sig = t1_vs_c1 >= cutoff           # enriched regions in condition 1
            cond2_sig = t2_vs_c2 >= cutoff           # enriched regions in condition 2
            # indices where score is above cutoff
            cat1 = np.where( and_( cond1_sig, cond1_over_cond2 ) )[ 0 ] # cond1 stronger than cond2
            cat2 = np.where( and_( cond2_over_cond1, cond2_sig ) )[ 0 ] # cond2 stronger than cond1
            cat3 = np.where( and_( and_( cond1_sig, cond2_sig ),        # cond1 and cond2 both enriched, similar strength
                                   cond1_equal_cond2 ) ) [ 0 ]

            cat1_endpos = pos[cat1]     # end positions of regions where score is above cutoff
            cat1_startpos = pos[cat1-1] # start positions of regions where score is above cutoff
            cat2_endpos = pos[cat2]
            cat2_startpos = pos[cat2-1]
            cat3_endpos = pos[cat3]
            cat3_startpos = pos[cat3-1]

            # for cat1: condition 1 stronger regions
            self.__add_a_peak ( cat1_peaks, chrom, cat1, cat1_startpos, cat1_endpos, t1_vs_t2, max_gap, min_length )
            # for cat2: condition 2 stronger regions
            self.__add_a_peak ( cat2_peaks, chrom, cat2, cat2_startpos, cat2_endpos, -1 * t1_vs_t2, max_gap, min_length )
            # for cat3: commonly strong regions
            self.__add_a_peak ( cat3_peaks, chrom, cat3, cat3_startpos, cat3_endpos, abs(t1_vs_t2), max_gap, min_length )

        return (cat1_peaks, cat2_peaks, cat3_peaks)
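
    # Category logic in brief (illustrative): with cutoff=3, an interval with
    # t1_vs_c1=5, t2_vs_c2=4 and t1_vs_t2=0.5 is enriched in both conditions
    # with |t1_vs_t2| <= cutoff, so it falls into cat3 (commonly strong);
    # raising t1_vs_t2 to 3.5 would move it into cat1 (condition 1 stronger).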

    cdef object __add_a_peak ( self, object peaks, bytes chrom, np.ndarray indices, np.ndarray startpos, np.ndarray endpos,
                               np.ndarray score, int32_t max_gap, int32_t min_length ):
        """For a given chromosome, merge nearby significant regions,
        filter out smaller regions, then add the regions to the PeakIO
        object.
        """
        cdef:
            int32_t i
            list peak_content
            float32_t mean_logLR

        if startpos.size > 0:
            # if it is not empty
            peak_content = []
            if indices[0] == 0:
                # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos']
                startpos[0] = 0
            # first bit of region above cutoff
            peak_content.append( (startpos[0], endpos[0], score[indices[ 0 ]]) )
            for i in range( 1, startpos.size ):
                if startpos[i] - peak_content[-1][1] <= max_gap:
                    # append
                    peak_content.append( ( startpos[i], endpos[i], score[indices[ i ]] ) )
                else:
                    # close
                    if peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] >= min_length:
                        mean_logLR = self.mean_from_peakcontent( peak_content )
                        peaks.add( chrom, peak_content[0][0], peak_content[-1][1],
                                   summit = -1, peak_score = mean_logLR, pileup = 0, pscore = 0,
                                   fold_change = 0, qscore = 0,
                                   )
                    peak_content = [(startpos[i], endpos[i], score[ indices[ i ] ]),]

            # save the last peak
            if peak_content:
                if peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] >= min_length:
                    mean_logLR = self.mean_from_peakcontent( peak_content )
                    peaks.add( chrom, peak_content[0][0], peak_content[-1][1],
                               summit = -1, peak_score = mean_logLR, pileup = 0, pscore = 0,
                               fold_change = 0, qscore = 0,
                               )
        return

    cdef float32_t mean_from_peakcontent ( self, list peakcontent ):
        """Return the length-weighted mean score over the regions in
        peakcontent.
        """
        cdef:
            int32_t tmp_s, tmp_e
            int32_t l
            float64_t tmp_v, sum_v      # for better precision
            float32_t r
            int32_t i

        l = 0
        sum_v = 0                       # initialize sum_v as 0
        for i in range( len(peakcontent) ):
            tmp_s = peakcontent[i][0]
            tmp_e = peakcontent[i][1]
            tmp_v = peakcontent[i][2]
            sum_v += tmp_v * ( tmp_e - tmp_s )
            l += tmp_e - tmp_s

        r = <float32_t>( sum_v / l )
        return r
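
    # Worked example (illustrative): peakcontent [(0, 100, 2.0), (100, 300, 5.0)]
    # gives sum_v = 2.0*100 + 5.0*200 = 1200 over l = 300 bp, i.e. r = 4.0.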

    cdef int64_t total ( self ):
        """Return the number of regions in this object.
        """
        cdef:
            int64_t t
            bytes chrom

        t = 0
        for chrom in sorted(self.data.keys()):
            t += self.datalength[chrom]
        return t