1 # cython: language_level=3
3 # Time-stamp: <2020-12-04 23:10:35 Tao Liu>
5 """Module for SAPPER PosReadsInfo class.
7 Copyright (c) 2017 Tao Liu <tliu4@buffalo.edu>
9 This code is free software; you can redistribute it and/or modify it
10 under the terms of the BSD License (see the file COPYING included
11 with the distribution).
16 @contact: tliu4@buffalo.edu
19 # ------------------------------------
21 # ------------------------------------
22 from MACS3
.Signal
.VariantStat
import CalModel_Homo
, CalModel_Heter_noAS
, CalModel_Heter_AS
, calculate_GQ
, calculate_GQ_heterASsig
23 from MACS3
.Signal
.Prob
import binomial_cdf
24 from MACS3
.Signal
.PeakVariants
import Variant
26 from cpython cimport bool
30 from numpy cimport uint32_t
, uint64_t
, int32_t
, float32_t
32 LN10
= 2.3025850929940458
34 cdef extern from "stdlib.h":
35 ctypedef unsigned
int size_t
36 size_t strlen
(char *s
)
37 void *malloc
(size_t size
)
38 void *calloc
(size_t n
, size_t size
)
40 int strcmp
(char *a
, char *b
)
41 char * strcpy
(char *a
, char *b
)
42 long atol
(char *bytes
)
45 # ------------------------------------
47 # ------------------------------------
48 __version__
= "Parser $Revision$"
49 __author__
= "Tao Liu <tliu4@buffalo.edu>"
50 __doc__
= "All Parser classes"
52 # ------------------------------------
54 # ------------------------------------
56 # ------------------------------------
58 # ------------------------------------
60 cdef class PosReadsInfo
:
65 bool filterout
# if true, do not output
67 dict bq_set_T
#{A:[], C:[], G:[], T:[], N:[]} for treatment
69 dict n_reads_T
#{A:[], C:[], G:[], T:[], N:[]} for treatment
73 list n_strand
#[{A:[], C:[], G:[], T:[], N:[]},{A:[], C:[], G:[], T:[], N:[]}] for total appearance on plus strand and minus strand for ChIP sample only
74 dict n_tips
# count of nt appearing at tips
78 float top12alleles_ratio
80 double lnL_homo_major
,lnL_heter_AS
,lnL_heter_noAS
,lnL_homo_minor
81 double BIC_homo_major
,BIC_heter_AS
,BIC_heter_noAS
,BIC_homo_minor
82 double PL_00
, PL_01
, PL_11
84 int heter_noAS_kc
, heter_noAS_ki
85 int heter_AS_kc
, heter_AS_ki
86 double heter_AS_alleleratio
88 int GQ_homo_major
,GQ_heter_noAS
,GQ_heter_AS
#phred scale of prob by standard formular
89 int GQ_heter_ASsig
#phred scale of prob, to measure the difference between AS and noAS
95 str mutation_type
# SNV or Insertion or Deletion
97 bool hasfermiinfor
#if no fermi bam overlap in the position, false; if fermi bam in the position GT: N, false; if anyone of top2allele is not in fermi GT NTs, false;
100 def __cinit__
( self ):
101 self.filterout
= False
104 self.alt_allele
= b
'.'
106 def __init__
( self, long ref_pos
, bytes ref_allele
):
107 self.ref_pos
= ref_pos
108 self.ref_allele
= ref_allele
109 self.bq_set_T
= { ref_allele
:[],b
'A':[], b
'C':[], b
'G':[], b
'T':[], b
'N':[], b
'*':[] }
110 self.bq_set_C
= { ref_allele
:[],b
'A':[], b
'C':[], b
'G':[], b
'T':[], b
'N':[], b
'*':[] }
111 self.n_reads_T
= { ref_allele
:0,b
'A':0, b
'C':0, b
'G':0, b
'T':0, b
'N':0, b
'*':0 }
112 self.n_reads_C
= { ref_allele
:0,b
'A':0, b
'C':0, b
'G':0, b
'T':0, b
'N':0, b
'*':0 }
113 self.n_reads
= { ref_allele
:0,b
'A':0, b
'C':0, b
'G':0, b
'T':0, b
'N':0, b
'*':0 }
114 self.n_strand
= [ { ref_allele
:0,b
'A':0, b
'C':0, b
'G':0, b
'T':0, b
'N':0, b
'*':0 }, { ref_allele
:0,b
'A':0, b
'C':0, b
'G':0, b
'T':0, b
'N':0, b
'*':0 } ]
115 self.n_tips
= { ref_allele
:0,b
'A':0, b
'C':0, b
'G':0, b
'T':0, b
'N':0, b
'*':0 }
118 #cpdef void merge ( self, PosReadsInfo PRI2 ):
119 # """Merge two PRIs. No check available.
122 # assert self.ref_pos == PRI2.ref_pos
123 # assert self.ref_allele == PRI2.ref_allele
124 # for b in set( self.n_reads.keys() ).union( set( PRI2.n_reads.keys() ) ):
125 # self.bq_set_T[ b ] = self.bq_set_T.get( b, []).extend( PRI2.bq_set_T.get( b, [] ) )
126 # self.bq_set_C[ b ] = self.bq_set_C.get( b, []).extend( PRI2.bq_set_C.get( b, [] ) )
127 # self.n_reads_T[ b ] = self.n_reads_T.get( b, 0) + PRI2.n_reads_T.get( b, 0 )
128 # self.n_reads_C[ b ] = self.n_reads_C.get( b, 0) + PRI2.n_reads_C.get( b, 0 )
129 # self.n_reads[ b ] = self.n_reads.get( b, 0) + PRI2.n_reads.get( b, 0 )
132 def __getstate__
( self ):
133 return ( self.ref_pos
, self.ref_allele
, self.alt_allele
, self.filterout
,
134 self.bq_set_T
, self.bq_set_C
, self.n_reads_T
, self.n_reads_C
, self.n_reads
, self.n_strand
, self.n_tips
,
135 self.top1allele
, self.top2allele
, self.top12alleles_ratio
,
136 self.lnL_homo_major
, self.lnL_heter_AS
, self.lnL_heter_noAS
, self.lnL_homo_minor
,
137 self.BIC_homo_major
, self.BIC_heter_AS
, self.BIC_heter_noAS
, self.BIC_homo_minor
,
138 self.heter_noAS_kc
, self.heter_noAS_ki
,
139 self.heter_AS_kc
, self.heter_AS_ki
,
140 self.heter_AS_alleleratio
,
141 self.GQ_homo_major
, self.GQ_heter_noAS
, self.GQ_heter_AS
,
149 def __setstate__
( self, state
):
150 ( self.ref_pos
, self.ref_allele
, self.alt_allele
, self.filterout
,
151 self.bq_set_T
, self.bq_set_C
, self.n_reads_T
, self.n_reads_C
, self.n_reads
, self.n_strand
, self.n_tips
,
152 self.top1allele
, self.top2allele
, self.top12alleles_ratio
,
153 self.lnL_homo_major
, self.lnL_heter_AS
, self.lnL_heter_noAS
, self.lnL_homo_minor
,
154 self.BIC_homo_major
, self.BIC_heter_AS
, self.BIC_heter_noAS
, self.BIC_homo_minor
,
155 self.heter_noAS_kc
, self.heter_noAS_ki
,
156 self.heter_AS_kc
, self.heter_AS_ki
,
157 self.heter_AS_alleleratio
,
158 self.GQ_homo_major
, self.GQ_heter_noAS
, self.GQ_heter_AS
,
164 self.fermiNTs
) = state
166 cpdef bool filterflag
( self ):
167 return self.filterout
169 cpdef
void apply_GQ_cutoff
( self, int min_homo_GQ
= 50, int min_heter_GQ
= 100 ):
172 if self.type.startswith
('homo') and self.GQ
< min_homo_GQ
:
173 self.filterout
= True
174 elif self.type.startswith
('heter') and self.GQ
< min_heter_GQ
:
175 self.filterout
= True
178 cpdef
void apply_deltaBIC_cutoff
( self, float min_delta_BIC
= 10 ):
181 if self.deltaBIC
< min_delta_BIC
:
182 self.filterout
= True
185 cpdef
void add_T
( self, int read_index
, bytes read_allele
, int read_bq
, int strand
, bool tip
, int Q
=20 ):
186 """ Strand 0: plus, 1: minus
188 Q is the quality cutoff. By default, only consider Q20 or read_bq > 20.
192 if not self.n_reads
.has_key
( read_allele
):
193 self.bq_set_T
[read_allele
] = []
194 self.bq_set_C
[read_allele
] = []
195 self.n_reads_T
[read_allele
] = 0
196 self.n_reads_C
[read_allele
] = 0
197 self.n_reads
[read_allele
] = 0
198 self.n_strand
[ 0 ][ read_allele
] = 0
199 self.n_strand
[ 1 ][ read_allele
] = 0
200 self.n_tips
[read_allele
] = 0
201 self.bq_set_T
[read_allele
].append
( read_bq
)
202 self.n_reads_T
[ read_allele
] += 1
203 self.n_reads
[ read_allele
] += 1
204 self.n_strand
[ strand
][ read_allele
] += 1
205 if tip
: self.n_tips
[ read_allele
] += 1
207 cpdef
void add_C
( self, int read_index
, bytes read_allele
, int read_bq
, int strand
, int Q
=20 ):
210 if not self.n_reads
.has_key
( read_allele
):
211 self.bq_set_T
[read_allele
] = []
212 self.bq_set_C
[read_allele
] = []
213 self.n_reads_T
[read_allele
] = 0
214 self.n_reads_C
[read_allele
] = 0
215 self.n_reads
[read_allele
] = 0
216 self.n_strand
[ 0 ][ read_allele
] = 0
217 self.n_strand
[ 1 ][ read_allele
] = 0
218 self.n_tips
[read_allele
] = 0
219 self.bq_set_C
[read_allele
].append
( read_bq
)
220 self.n_reads_C
[ read_allele
] += 1
221 self.n_reads
[ read_allele
] += 1
222 #self.n_strand[ strand ][ read_allele ] += 1
224 cpdef
int raw_read_depth
( self, str opt
= "all" ):
226 return sum
( self.n_reads
.values
() )
228 return sum
( self.n_reads_T
.values
() )
230 return sum
( self.n_reads_C
.values
() )
232 raise Exception( "opt should be either 'all', 'T' or 'C'." )
234 cpdef
void update_top_alleles
( self, float min_top12alleles_ratio
= 0.8, int min_altallele_count
= 2, float max_allowed_ar
= 0.95 ):
235 #cpdef update_top_alleles ( self, float min_top12alleles_ratio = 0.8 ):
236 """Identify top1 and top2 NT. the ratio of (top1+top2)/total
241 [self.top1allele
, self.top2allele
] = sorted
(self.n_reads
, key
=self.n_reads_T
.get
, reverse
=True
)[:2]
243 # if top2 allele count in ChIP is lower than
244 # min_altallele_count, or when allele ratio top1/(top1+top2)
245 # is larger than max_allowed_ar in ChIP, we won't consider
246 # this allele at all. we set values of top2 allele in
247 # dictionaries to [] and ignore top2 allele entirely.
249 # max(self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top2allele ]) < min_altallele_count
250 #if self.ref_pos == 52608504:
251 # print self.ref_pos, self.n_reads_T[ self.top1allele ], self.n_reads_T[ self.top2allele ], self.n_reads_C[ self.top1allele ], self.n_reads_C[ self.top2allele ]
252 if self.n_reads_T
[ self.top1allele
] + self.n_reads_T
[ self.top2allele
] == 0:
253 self.filterout
= True
256 if (len(self.top1allele
)==1 and len(self.top2allele
)==1) and ( self.top2allele
!= self.ref_allele
and ( ( self.n_reads_T
[ self.top2allele
] - self.n_tips
[ self.top2allele
] ) < min_altallele_count
) or \
257 self.n_reads_T
[ self.top1allele
]/(self.n_reads_T
[ self.top1allele
] + self.n_reads_T
[ self.top2allele
]) > max_allowed_ar
):
258 self.bq_set_T
[ self.top2allele
] = []
259 self.bq_set_C
[ self.top2allele
] = []
260 self.n_reads_T
[ self.top2allele
] = 0
261 self.n_reads_C
[ self.top2allele
] = 0
262 self.n_reads
[ self.top2allele
] = 0
263 self.n_tips
[ self.top2allele
] = 0
264 if ( self.top1allele
!= self.ref_allele
and ( self.n_reads_T
[ self.top1allele
] - self.n_tips
[ self.top1allele
] ) < min_altallele_count
):
265 self.bq_set_T
[ self.top1allele
] = []
266 self.bq_set_C
[ self.top1allele
] = []
267 self.n_reads_T
[ self.top1allele
] = 0
268 self.n_reads_C
[ self.top1allele
] = 0
269 self.n_reads
[ self.top1allele
] = 0
270 self.n_tips
[ self.top1allele
] = 0
272 if self.n_reads_T
[ self.top1allele
] + self.n_reads_T
[ self.top2allele
] == 0:
273 self.filterout
= True
276 self.top12alleles_ratio
= ( self.n_reads
[ self.top1allele
] + self.n_reads
[ self.top2allele
] ) / sum
( self.n_reads
.values
() )
277 if self.top12alleles_ratio
< min_top12alleles_ratio
:
278 self.filterout
= True
281 if self.top1allele
== self.ref_allele
and self.n_reads
[ self.top2allele
] == 0:
282 # This means this position only contains top1allele which is the ref_allele. So the GT must be 0/0
283 self.type = "homo_ref"
284 self.filterout
= True
288 cpdef
void top12alleles
( self ):
289 print ( self.ref_pos
, self.ref_allele
)
290 print ("Top1allele",self.top1allele
, "Treatment", self.bq_set_T
[self.top1allele
], "Control", self.bq_set_C
[self.top1allele
])
291 print ("Top2allele",self.top2allele
, "Treatment", self.bq_set_T
[self.top2allele
], "Control", self.bq_set_C
[self.top2allele
])
293 cpdef
void call_GT
( self, float max_allowed_ar
= 0.99 ):
294 """Require update_top_alleles being called.
297 np
.ndarray
[np
.int32_t
, ndim
=1] top1_bq_T
298 np
.ndarray
[np
.int32_t
, ndim
=1] top2_bq_T
299 np
.ndarray
[np
.int32_t
, ndim
=1] top1_bq_C
300 np
.ndarray
[np
.int32_t
, ndim
=1] top2_bq_C
306 list tmp_mutation_type
312 top1_bq_T
= np
.array
( self.bq_set_T
[ self.top1allele
], dtype
="int32" )
313 top2_bq_T
= np
.array
( self.bq_set_T
[ self.top2allele
], dtype
="int32" )
314 top1_bq_C
= np
.array
( self.bq_set_C
[ self.top1allele
], dtype
="int32" )
315 top2_bq_C
= np
.array
( self.bq_set_C
[ self.top2allele
], dtype
="int32" )
316 (self.lnL_homo_major
, self.BIC_homo_major
) = CalModel_Homo
( top1_bq_T
, top1_bq_C
, top2_bq_T
, top2_bq_C
)
317 (self.lnL_homo_minor
, self.BIC_homo_minor
) = CalModel_Homo
( top2_bq_T
, top2_bq_C
, top1_bq_T
, top1_bq_C
)
318 (self.lnL_heter_noAS
, self.BIC_heter_noAS
) = CalModel_Heter_noAS
( top1_bq_T
, top1_bq_C
, top2_bq_T
, top2_bq_C
)
319 (self.lnL_heter_AS
, self.BIC_heter_AS
) = CalModel_Heter_AS
( top1_bq_T
, top1_bq_C
, top2_bq_T
, top2_bq_C
, max_allowed_ar
)
321 #if self.ref_pos == 71078525:
323 # print len( top1_bq_T ), len( top1_bq_C ), len( top2_bq_T ), len( top2_bq_C )
324 # print self.lnL_homo_major, self.lnL_homo_minor, self.lnL_heter_noAS, self.lnL_heter_AS
325 # print self.BIC_homo_major, self.BIC_homo_minor, self.BIC_heter_noAS, self.BIC_heter_AS
327 if self.top1allele
!= self.ref_allele
and self.n_reads
[ self.top2allele
] == 0:
328 # in this case, there is no top2 nt (or socalled minor
329 # allele) in either treatment or control, we should assume
330 # it's a 1/1 genotype. We will take 1/1 if it passes BIC
331 # test (deltaBIC >=2), and will skip this loci if it can't
334 self.deltaBIC
= min( self.BIC_heter_noAS
, self.BIC_heter_AS
, self.BIC_homo_minor
) - self.BIC_homo_major
335 if self.deltaBIC
< 2:
336 self.filterout
= True
342 self.PL_00
= -10.0 * self.lnL_homo_minor
/ LN10
343 self.PL_01
= -10.0 * max( self.lnL_heter_noAS
, self.lnL_heter_AS
) / LN10
344 self.PL_11
= -10.0 * self.lnL_homo_major
/ LN10
346 self.PL_00
= max( 0, self.PL_00
- self.PL_11
)
347 self.PL_01
= max( 0, self.PL_01
- self.PL_11
)
350 self.GQ
= min( self.PL_00
, self.PL_01
)
351 self.alt_allele
= self.top1allele
353 # assign GQ, GT, and type
354 if self.ref_allele
!= self.top1allele
and self.BIC_homo_major
+ 2 <= self.BIC_homo_minor
and self.BIC_homo_major
+ 2 <= self.BIC_heter_noAS
and self.BIC_homo_major
+ 2 <= self.BIC_heter_AS
:
356 self.deltaBIC
= min( self.BIC_heter_noAS
, self.BIC_heter_AS
, self.BIC_homo_minor
) - self.BIC_homo_major
358 self.alt_allele
= self.top1allele
360 self.PL_00
= -10.0 * self.lnL_homo_minor
/ LN10
361 self.PL_01
= -10.0 * max( self.lnL_heter_noAS
, self.lnL_heter_AS
) / LN10
362 self.PL_11
= -10.0 * self.lnL_homo_major
/ LN10
364 self.PL_00
= self.PL_00
- self.PL_11
365 self.PL_01
= self.PL_01
- self.PL_11
368 self.GQ
= min( self.PL_00
, self.PL_01
)
370 elif self.BIC_heter_noAS
+ 2 <= self.BIC_homo_major
and self.BIC_heter_noAS
+ 2 <= self.BIC_homo_minor
and self.BIC_heter_noAS
+ 2 <= self.BIC_heter_AS
:
371 self.type = "heter_noAS"
372 self.deltaBIC
= min( self.BIC_homo_major
, self.BIC_homo_minor
) - self.BIC_heter_noAS
374 self.PL_00
= -10.0 * self.lnL_homo_minor
/ LN10
375 self.PL_01
= -10.0 * self.lnL_heter_noAS
/ LN10
376 self.PL_11
= -10.0 * self.lnL_homo_major
/ LN10
378 self.PL_00
= self.PL_00
- self.PL_01
379 self.PL_11
= self.PL_11
- self.PL_01
382 self.GQ
= min( self.PL_00
, self.PL_11
)
384 elif self.BIC_heter_AS
+ 2 <= self.BIC_homo_major
and self.BIC_heter_AS
+ 2 <= self.BIC_homo_minor
and self.BIC_heter_AS
+ 2 <= self.BIC_heter_noAS
:
385 self.type = "heter_AS"
386 self.deltaBIC
= min( self.BIC_homo_major
, self.BIC_homo_minor
) - self.BIC_heter_AS
388 self.PL_00
= -10.0 * self.lnL_homo_minor
/ LN10
389 self.PL_01
= -10.0 * self.lnL_heter_AS
/ LN10
390 self.PL_11
= -10.0 * self.lnL_homo_major
/ LN10
392 self.PL_00
= self.PL_00
- self.PL_01
393 self.PL_11
= self.PL_11
- self.PL_01
396 self.GQ
= min( self.PL_00
, self.PL_11
)
398 elif self.BIC_heter_AS
+ 2 <= self.BIC_homo_major
and self.BIC_heter_AS
+ 2 <= self.BIC_homo_minor
:
399 # can't decide if it's noAS or AS
400 self.type = "heter_unsure"
401 self.deltaBIC
= min( self.BIC_homo_major
, self.BIC_homo_minor
) - max( self.BIC_heter_AS
, self.BIC_heter_noAS
)
403 self.PL_00
= -10.0 * self.lnL_homo_minor
/ LN10
404 self.PL_01
= -10.0 * max( self.lnL_heter_noAS
, self.lnL_heter_AS
) / LN10
405 self.PL_11
= -10.0 * self.lnL_homo_major
/ LN10
407 self.PL_00
= self.PL_00
- self.PL_01
408 self.PL_11
= self.PL_11
- self.PL_01
411 self.GQ
= min( self.PL_00
, self.PL_11
)
413 elif self.ref_allele
== self.top1allele
and self.BIC_homo_major
< self.BIC_homo_minor
and self.BIC_homo_major
< self.BIC_heter_noAS
and self.BIC_homo_major
< self.BIC_heter_AS
:
414 self.type = "homo_ref"
415 # we do not calculate GQ if type is homo_ref
417 self.filterout
= True
420 self.filterout
= True
422 if self.type.startswith
( "heter" ):
423 if self.ref_allele
== self.top1allele
:
424 self.alt_allele
= self.top2allele
426 elif self.ref_allele
== self.top2allele
:
427 self.alt_allele
= self.top1allele
430 self.alt_allele
= self.top1allele
+b
','+self.top2allele
432 # strand bias filter, uncomment following if wish to debug
434 #print "calculate SB score for ", self.ref_pos, "a/b/c/d:", self.n_strand[ 0 ][ self.top1allele ], self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top1allele ], self.n_strand[ 1 ][ self.top2allele ]
435 #SBscore = self.SB_score_ChIP( self.n_strand[ 0 ][ self.top1allele ], self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top1allele ], self.n_strand[ 1 ][ self.top2allele ] )
438 # print "disgard variant at", self.ref_pos, "type", self.type
439 # self.filterout = True
441 # if self.ref_allele == self.top1alleleļ¼
442 # self.n_strand[ 0 ][ self.top1allele ] + self.n_strand[ 1 ][ self.top1allele ]
443 # if and self.n_strand[ 0 ][ self.top2allele ] == 0 or self.n_strand[ 1 ][ self.top2allele ] == 0:
444 # self.filterout = True
448 # self.deltaBIC = self.deltaBIC
450 tmp_mutation_type
= []
451 for tmp_alt
in self.alt_allele
.split
(b
','):
453 tmp_mutation_type
.append
( "Deletion" )
454 elif len( tmp_alt
) > 1:
455 tmp_mutation_type
.append
( "Insertion" )
457 tmp_mutation_type
.append
( "SNV" )
458 self.mutation_type
= ",".join
( tmp_mutation_type
)
461 cdef float SB_score_ChIP
( self, int a
, int b
, int c
, int d
):
462 """ calculate score for filtering variants with strange strand biases.
464 a: top1/major allele plus strand
465 b: top2/minor allele plus strand
466 c: top1/major allele minus strand
467 d: top2/minor allele minus strand
469 Return a float value so that if this value >= 1, the variant will be filtered out.
476 double top2_sb
, top1_sb
478 if a
+b
== 0 or c
+d
== 0:
479 # if major allele and minor allele both bias to the same strand, allow it
483 # if there is bias in top2 allele then bias in top1 allele should not be significantly smaller than it.
484 # or there is no significant bias (0.5) in top2 allele.
487 p1_l
= binomial_cdf
( a
, (a
+c
), 0.5, lower
=True
) # alternative: less than 0.5
488 p1_r
= binomial_cdf
( c
, (a
+c
), 0.5, lower
=True
) # greater than 0.5
489 p2_l
= binomial_cdf
( b
, (b
+d
), 0.5, lower
=True
) # alternative: less than 0.5
490 p2_r
= binomial_cdf
( d
, (b
+d
), 0.5, lower
=True
) # greater than 0.5
491 #print p1_l, p1_r, p2_l, p2_r
493 if (p1_l
< 0.05 and p2_r
< 0.05) or (p1_r
< 0.05 and p2_l
< 0.05):
494 # we reject loci where the significant biases are inconsistent between top1 and top2 alleles.
497 # if b<=2 and d=0 or b=0 and d<=2 -- highly possible FPs
498 #if ( b<=2 and d==0 or b==0 and d<=2 ):
503 cdef float SB_score_ATAC
( self, int a
, int b
, int c
, int d
):
504 """ calculate score for filtering variants with strange strand biases.
508 a: top1/major allele plus strand
509 b: top2/minor allele plus strand
510 c: top1/major allele minus strand
511 d: top2/minor allele minus strand
513 Return a float value so that if this value >= 1, the variant will be filtered out.
520 double top2_sb
, top1_sb
522 if a
+b
== 0 or c
+d
== 0:
523 # if major allele and minor allele both bias to the same strand, allow it
527 # if there is bias in top2 allele then bias in top1 allele should not be significantly smaller than it.
528 # or there is no significant bias (0.5) in top2 allele.
531 p1_l
= binomial_cdf
( a
, (a
+c
), 0.5, lower
=True
) # alternative: less than 0.5
532 p1_r
= binomial_cdf
( c
, (a
+c
), 0.5, lower
=True
) # greater than 0.5
533 p2_l
= binomial_cdf
( b
, (b
+d
), 0.5, lower
=True
) # alternative: less than 0.5
534 p2_r
= binomial_cdf
( d
, (b
+d
), 0.5, lower
=True
) # greater than 0.5
535 #print p1_l, p1_r, p2_l, p2_r
537 if (p1_l
< 0.05 and p2_r
< 0.05) or (p1_r
< 0.05 and p2_l
< 0.05):
538 # we reject loci where the significant biases are inconsistent between top1 and top2 alleles.
544 cpdef
str to_vcf
( self ):
545 """Output REF,ALT,QUAL,FILTER,INFO,FORMAT, SAMPLE columns.
548 str vcf_ref
, vcf_alt
, vcf_qual
, vcf_filter
, vcf_info
, vcf_format
, vcf_sample
550 vcf_ref
= self.ref_allele
.decode
()
551 vcf_alt
= self.alt_allele
.decode
()
552 vcf_qual
= "%d" % self.GQ
554 vcf_info
= (b
"M=%s;MT=%s;DPT=%d;DPC=%d;DP1T=%d%s;DP2T=%d%s;DP1C=%d%s;DP2C=%d%s;SB=%d,%d,%d,%d;DBIC=%.2f;BICHOMOMAJOR=%.2f;BICHOMOMINOR=%.2f;BICHETERNOAS=%.2f;BICHETERAS=%.2f;AR=%.2f" % \
555 (self.type.encode
(), self.mutation_type
.encode
(), sum
( self.n_reads_T
.values
() ), sum
( self.n_reads_C
.values
() ),
556 self.n_reads_T
[self.top1allele
], self.top1allele
, self.n_reads_T
[self.top2allele
], self.top2allele
,
557 self.n_reads_C
[self.top1allele
], self.top1allele
, self.n_reads_C
[self.top2allele
], self.top2allele
,
558 self.n_strand
[ 0 ][ self.top1allele
], self.n_strand
[ 0 ][ self.top2allele
], self.n_strand
[ 1 ][ self.top1allele
], self.n_strand
[ 1 ][ self.top2allele
],
560 self.BIC_homo_major
, self.BIC_homo_minor
, self.BIC_heter_noAS
,self.BIC_heter_AS
,
561 self.n_reads_T
[self.top1allele
]/(self.n_reads_T
[self.top1allele
]+self.n_reads_T
[self.top2allele
])
563 vcf_format
= "GT:DP:GQ:PL"
564 vcf_sample
= "%s:%d:%d:%d,%d,%d" % (self.GT
, self.raw_read_depth
( opt
= "all" ), self.GQ
, self.PL_00
, self.PL_01
, self.PL_11
)
565 return "\t".join
( ( vcf_ref
, vcf_alt
, vcf_qual
, vcf_filter
, vcf_info
, vcf_format
, vcf_sample
) )
567 cpdef toVariant
( self ):
571 self.ref_allele
.decode
(),
572 self.alt_allele
.decode
(),
577 self.top1allele
.decode
(),
578 self.top2allele
.decode
(),
579 sum
( self.n_reads_T
.values
() ),
580 sum
( self.n_reads_C
.values
() ),
581 self.n_reads_T
[self.top1allele
],
582 self.n_reads_T
[self.top2allele
],
583 self.n_reads_C
[self.top1allele
],
584 self.n_reads_C
[self.top2allele
],
585 self.n_strand
[ 0 ][ self.top1allele
],
586 self.n_strand
[ 0 ][ self.top2allele
],
587 self.n_strand
[ 1 ][ self.top1allele
],
588 self.n_strand
[ 1 ][ self.top2allele
],
594 self.n_reads_T
[self.top1allele
]/(self.n_reads_T
[self.top1allele
]+self.n_reads_T
[self.top2allele
]),
596 self.raw_read_depth
( opt
= "all" ),