Merge pull request #589 from tjni/clean-up-build-dependencies
[MACS.git] / MACS3 / Signal / PosReadsInfo.pyx
blob93d38348090df5dbdb9492942dbc8368bcf3ebec
1 # cython: language_level=3
2 # cython: profile=True
3 # Time-stamp: <2020-12-04 23:10:35 Tao Liu>
5 """Module for SAPPER PosReadsInfo class.
7 Copyright (c) 2017 Tao Liu <tliu4@buffalo.edu>
9 This code is free software; you can redistribute it and/or modify it
10 under the terms of the BSD License (see the file COPYING included
11 with the distribution).
13 @status: experimental
14 @version: $Revision$
15 @author: Tao Liu
16 @contact: tliu4@buffalo.edu
17 """
19 # ------------------------------------
20 # python modules
21 # ------------------------------------
22 from MACS3.Signal.VariantStat import CalModel_Homo, CalModel_Heter_noAS, CalModel_Heter_AS, calculate_GQ, calculate_GQ_heterASsig
23 from MACS3.Signal.Prob import binomial_cdf
24 from MACS3.Signal.PeakVariants import Variant
26 from cpython cimport bool
28 import numpy as np
29 cimport numpy as np
30 from numpy cimport uint32_t, uint64_t, int32_t, float32_t
32 LN10 = 2.3025850929940458
34 cdef extern from "stdlib.h":
35 ctypedef unsigned int size_t
36 size_t strlen(char *s)
37 void *malloc(size_t size)
38 void *calloc(size_t n, size_t size)
39 void free(void *ptr)
40 int strcmp(char *a, char *b)
41 char * strcpy(char *a, char *b)
42 long atol(char *bytes)
43 int atoi(char *bytes)
45 # ------------------------------------
46 # constants
47 # ------------------------------------
48 __version__ = "Parser $Revision$"
49 __author__ = "Tao Liu <tliu4@buffalo.edu>"
50 __doc__ = "All Parser classes"
52 # ------------------------------------
53 # Misc functions
54 # ------------------------------------
56 # ------------------------------------
57 # Classes
58 # ------------------------------------
60 cdef class PosReadsInfo:
61 cdef:
62 long ref_pos
63 bytes ref_allele
64 bytes alt_allele
65 bool filterout # if true, do not output
67 dict bq_set_T #{A:[], C:[], G:[], T:[], N:[]} for treatment
68 dict bq_set_C
69 dict n_reads_T #{A:[], C:[], G:[], T:[], N:[]} for treatment
70 dict n_reads_C
71 dict n_reads
73 list n_strand #[{A:[], C:[], G:[], T:[], N:[]},{A:[], C:[], G:[], T:[], N:[]}] for total appearance on plus strand and minus strand for ChIP sample only
74 dict n_tips # count of nt appearing at tips
76 bytes top1allele
77 bytes top2allele
78 float top12alleles_ratio
80 double lnL_homo_major,lnL_heter_AS,lnL_heter_noAS,lnL_homo_minor
81 double BIC_homo_major,BIC_heter_AS,BIC_heter_noAS,BIC_homo_minor
82 double PL_00, PL_01, PL_11
83 double deltaBIC
84 int heter_noAS_kc, heter_noAS_ki
85 int heter_AS_kc, heter_AS_ki
86 double heter_AS_alleleratio
88 int GQ_homo_major,GQ_heter_noAS,GQ_heter_AS #phred scale of prob by standard formular
89 int GQ_heter_ASsig #phred scale of prob, to measure the difference between AS and noAS
91 double GQ
93 str GT
94 str type
95 str mutation_type # SNV or Insertion or Deletion
97 bool hasfermiinfor #if no fermi bam overlap in the position, false; if fermi bam in the position GT: N, false; if anyone of top2allele is not in fermi GT NTs, false;
98 bytearray fermiNTs #
100 def __cinit__ ( self ):
101 self.filterout = False
102 self.GQ = 0
103 self.GT = "unsure"
104 self.alt_allele = b'.'
106 def __init__ ( self, long ref_pos, bytes ref_allele ):
107 self.ref_pos = ref_pos
108 self.ref_allele = ref_allele
109 self.bq_set_T = { ref_allele:[],b'A':[], b'C':[], b'G':[], b'T':[], b'N':[], b'*':[] }
110 self.bq_set_C = { ref_allele:[],b'A':[], b'C':[], b'G':[], b'T':[], b'N':[], b'*':[] }
111 self.n_reads_T = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 }
112 self.n_reads_C = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 }
113 self.n_reads = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 }
114 self.n_strand = [ { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 }, { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 } ]
115 self.n_tips = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 }
118 #cpdef void merge ( self, PosReadsInfo PRI2 ):
119 # """Merge two PRIs. No check available.
121 # """
122 # assert self.ref_pos == PRI2.ref_pos
123 # assert self.ref_allele == PRI2.ref_allele
124 # for b in set( self.n_reads.keys() ).union( set( PRI2.n_reads.keys() ) ):
125 # self.bq_set_T[ b ] = self.bq_set_T.get( b, []).extend( PRI2.bq_set_T.get( b, [] ) )
126 # self.bq_set_C[ b ] = self.bq_set_C.get( b, []).extend( PRI2.bq_set_C.get( b, [] ) )
127 # self.n_reads_T[ b ] = self.n_reads_T.get( b, 0) + PRI2.n_reads_T.get( b, 0 )
128 # self.n_reads_C[ b ] = self.n_reads_C.get( b, 0) + PRI2.n_reads_C.get( b, 0 )
129 # self.n_reads[ b ] = self.n_reads.get( b, 0) + PRI2.n_reads.get( b, 0 )
130 # return
132 def __getstate__ ( self ):
133 return ( self.ref_pos, self.ref_allele, self.alt_allele, self.filterout,
134 self.bq_set_T, self.bq_set_C, self.n_reads_T, self.n_reads_C, self.n_reads, self.n_strand, self.n_tips,
135 self.top1allele, self.top2allele, self.top12alleles_ratio,
136 self.lnL_homo_major, self.lnL_heter_AS, self.lnL_heter_noAS, self.lnL_homo_minor,
137 self.BIC_homo_major, self.BIC_heter_AS, self.BIC_heter_noAS, self.BIC_homo_minor,
138 self.heter_noAS_kc, self.heter_noAS_ki,
139 self.heter_AS_kc, self.heter_AS_ki,
140 self.heter_AS_alleleratio,
141 self.GQ_homo_major, self.GQ_heter_noAS, self.GQ_heter_AS,
142 self.GQ_heter_ASsig,
143 self.GQ,
144 self.GT,
145 self.type,
146 self.hasfermiinfor,
147 self.fermiNTs )
149 def __setstate__ ( self, state ):
150 ( self.ref_pos, self.ref_allele, self.alt_allele, self.filterout,
151 self.bq_set_T, self.bq_set_C, self.n_reads_T, self.n_reads_C, self.n_reads, self.n_strand, self.n_tips,
152 self.top1allele, self.top2allele, self.top12alleles_ratio,
153 self.lnL_homo_major, self.lnL_heter_AS, self.lnL_heter_noAS, self.lnL_homo_minor,
154 self.BIC_homo_major, self.BIC_heter_AS, self.BIC_heter_noAS, self.BIC_homo_minor,
155 self.heter_noAS_kc, self.heter_noAS_ki,
156 self.heter_AS_kc, self.heter_AS_ki,
157 self.heter_AS_alleleratio,
158 self.GQ_homo_major, self.GQ_heter_noAS, self.GQ_heter_AS,
159 self.GQ_heter_ASsig,
160 self.GQ,
161 self.GT,
162 self.type,
163 self.hasfermiinfor,
164 self.fermiNTs ) = state
166 cpdef bool filterflag ( self ):
167 return self.filterout
169 cpdef void apply_GQ_cutoff ( self, int min_homo_GQ = 50, int min_heter_GQ = 100 ):
170 if self.filterout:
171 return
172 if self.type.startswith('homo') and self.GQ < min_homo_GQ:
173 self.filterout = True
174 elif self.type.startswith('heter') and self.GQ < min_heter_GQ:
175 self.filterout = True
176 return
178 cpdef void apply_deltaBIC_cutoff ( self, float min_delta_BIC = 10 ):
179 if self.filterout:
180 return
181 if self.deltaBIC < min_delta_BIC:
182 self.filterout = True
183 return
185 cpdef void add_T ( self, int read_index, bytes read_allele, int read_bq, int strand, bool tip, int Q=20 ):
186 """ Strand 0: plus, 1: minus
188 Q is the quality cutoff. By default, only consider Q20 or read_bq > 20.
190 if read_bq <= Q:
191 return
192 if not self.n_reads.has_key( read_allele ):
193 self.bq_set_T[read_allele] = []
194 self.bq_set_C[read_allele] = []
195 self.n_reads_T[read_allele] = 0
196 self.n_reads_C[read_allele] = 0
197 self.n_reads[read_allele] = 0
198 self.n_strand[ 0 ][ read_allele ] = 0
199 self.n_strand[ 1 ][ read_allele ] = 0
200 self.n_tips[read_allele] = 0
201 self.bq_set_T[read_allele].append( read_bq )
202 self.n_reads_T[ read_allele ] += 1
203 self.n_reads[ read_allele ] += 1
204 self.n_strand[ strand ][ read_allele ] += 1
205 if tip: self.n_tips[ read_allele ] += 1
207 cpdef void add_C ( self, int read_index, bytes read_allele, int read_bq, int strand, int Q=20 ):
208 if read_bq <= Q:
209 return
210 if not self.n_reads.has_key( read_allele ):
211 self.bq_set_T[read_allele] = []
212 self.bq_set_C[read_allele] = []
213 self.n_reads_T[read_allele] = 0
214 self.n_reads_C[read_allele] = 0
215 self.n_reads[read_allele] = 0
216 self.n_strand[ 0 ][ read_allele ] = 0
217 self.n_strand[ 1 ][ read_allele ] = 0
218 self.n_tips[read_allele] = 0
219 self.bq_set_C[read_allele].append( read_bq )
220 self.n_reads_C[ read_allele ] += 1
221 self.n_reads[ read_allele ] += 1
222 #self.n_strand[ strand ][ read_allele ] += 1
224 cpdef int raw_read_depth ( self, str opt = "all" ):
225 if opt == "all":
226 return sum( self.n_reads.values() )
227 elif opt == "T":
228 return sum( self.n_reads_T.values() )
229 elif opt == "C":
230 return sum( self.n_reads_C.values() )
231 else:
232 raise Exception( "opt should be either 'all', 'T' or 'C'." )
234 cpdef void update_top_alleles ( self, float min_top12alleles_ratio = 0.8, int min_altallele_count = 2, float max_allowed_ar = 0.95 ):
235 #cpdef update_top_alleles ( self, float min_top12alleles_ratio = 0.8 ):
236 """Identify top1 and top2 NT. the ratio of (top1+top2)/total
238 cdef:
239 float r
241 [self.top1allele, self.top2allele] = sorted(self.n_reads, key=self.n_reads_T.get, reverse=True)[:2]
243 # if top2 allele count in ChIP is lower than
244 # min_altallele_count, or when allele ratio top1/(top1+top2)
245 # is larger than max_allowed_ar in ChIP, we won't consider
246 # this allele at all. we set values of top2 allele in
247 # dictionaries to [] and ignore top2 allele entirely.
249 # max(self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top2allele ]) < min_altallele_count
250 #if self.ref_pos == 52608504:
251 # print self.ref_pos, self.n_reads_T[ self.top1allele ], self.n_reads_T[ self.top2allele ], self.n_reads_C[ self.top1allele ], self.n_reads_C[ self.top2allele ]
252 if self.n_reads_T[ self.top1allele ] + self.n_reads_T[ self.top2allele ] == 0:
253 self.filterout = True
254 return
256 if (len(self.top1allele)==1 and len(self.top2allele)==1) and ( self.top2allele != self.ref_allele and ( ( self.n_reads_T[ self.top2allele ] - self.n_tips[ self.top2allele ] ) < min_altallele_count ) or \
257 self.n_reads_T[ self.top1allele ]/(self.n_reads_T[ self.top1allele ] + self.n_reads_T[ self.top2allele ]) > max_allowed_ar ):
258 self.bq_set_T[ self.top2allele ] = []
259 self.bq_set_C[ self.top2allele ] = []
260 self.n_reads_T[ self.top2allele ] = 0
261 self.n_reads_C[ self.top2allele ] = 0
262 self.n_reads[ self.top2allele ] = 0
263 self.n_tips[ self.top2allele ] = 0
264 if ( self.top1allele != self.ref_allele and ( self.n_reads_T[ self.top1allele ] - self.n_tips[ self.top1allele ] ) < min_altallele_count ):
265 self.bq_set_T[ self.top1allele ] = []
266 self.bq_set_C[ self.top1allele ] = []
267 self.n_reads_T[ self.top1allele ] = 0
268 self.n_reads_C[ self.top1allele ] = 0
269 self.n_reads[ self.top1allele ] = 0
270 self.n_tips[ self.top1allele ] = 0
272 if self.n_reads_T[ self.top1allele ] + self.n_reads_T[ self.top2allele ] == 0:
273 self.filterout = True
274 return
276 self.top12alleles_ratio = ( self.n_reads[ self.top1allele ] + self.n_reads[ self.top2allele ] ) / sum( self.n_reads.values() )
277 if self.top12alleles_ratio < min_top12alleles_ratio:
278 self.filterout = True
279 return
281 if self.top1allele == self.ref_allele and self.n_reads[ self.top2allele ] == 0:
282 # This means this position only contains top1allele which is the ref_allele. So the GT must be 0/0
283 self.type = "homo_ref"
284 self.filterout = True
285 return
286 return
288 cpdef void top12alleles ( self ):
289 print ( self.ref_pos, self.ref_allele)
290 print ("Top1allele",self.top1allele, "Treatment", self.bq_set_T[self.top1allele], "Control", self.bq_set_C[self.top1allele])
291 print ("Top2allele",self.top2allele, "Treatment", self.bq_set_T[self.top2allele], "Control", self.bq_set_C[self.top2allele])
293 cpdef void call_GT ( self, float max_allowed_ar = 0.99 ):
294 """Require update_top_alleles being called.
296 cdef:
297 np.ndarray[np.int32_t, ndim=1] top1_bq_T
298 np.ndarray[np.int32_t, ndim=1] top2_bq_T
299 np.ndarray[np.int32_t, ndim=1] top1_bq_C
300 np.ndarray[np.int32_t, ndim=1] top2_bq_C
301 int i
302 list top1_bq_T_l
303 list top2_bq_T_l
304 list top1_bq_C_l
305 list top2_bq_C_l
306 list tmp_mutation_type
307 bytes tmp_alt
309 if self.filterout:
310 return
312 top1_bq_T = np.array( self.bq_set_T[ self.top1allele ], dtype="int32" )
313 top2_bq_T = np.array( self.bq_set_T[ self.top2allele ], dtype="int32" )
314 top1_bq_C = np.array( self.bq_set_C[ self.top1allele ], dtype="int32" )
315 top2_bq_C = np.array( self.bq_set_C[ self.top2allele ], dtype="int32" )
316 (self.lnL_homo_major, self.BIC_homo_major) = CalModel_Homo( top1_bq_T, top1_bq_C, top2_bq_T, top2_bq_C )
317 (self.lnL_homo_minor, self.BIC_homo_minor) = CalModel_Homo( top2_bq_T, top2_bq_C, top1_bq_T, top1_bq_C )
318 (self.lnL_heter_noAS, self.BIC_heter_noAS) = CalModel_Heter_noAS( top1_bq_T, top1_bq_C, top2_bq_T, top2_bq_C )
319 (self.lnL_heter_AS, self.BIC_heter_AS) = CalModel_Heter_AS( top1_bq_T, top1_bq_C, top2_bq_T, top2_bq_C, max_allowed_ar )
321 #if self.ref_pos == 71078525:
322 # print "---"
323 # print len( top1_bq_T ), len( top1_bq_C ), len( top2_bq_T ), len( top2_bq_C )
324 # print self.lnL_homo_major, self.lnL_homo_minor, self.lnL_heter_noAS, self.lnL_heter_AS
325 # print self.BIC_homo_major, self.BIC_homo_minor, self.BIC_heter_noAS, self.BIC_heter_AS
327 if self.top1allele != self.ref_allele and self.n_reads[ self.top2allele ] == 0:
328 # in this case, there is no top2 nt (or socalled minor
329 # allele) in either treatment or control, we should assume
330 # it's a 1/1 genotype. We will take 1/1 if it passes BIC
331 # test (deltaBIC >=2), and will skip this loci if it can't
332 # pass the test.
334 self.deltaBIC = min( self.BIC_heter_noAS, self.BIC_heter_AS, self.BIC_homo_minor ) - self.BIC_homo_major
335 if self.deltaBIC < 2:
336 self.filterout = True
337 return
339 self.type = "homo"
340 self.GT = "1/1"
342 self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
343 self.PL_01 = -10.0 * max( self.lnL_heter_noAS, self.lnL_heter_AS ) / LN10
344 self.PL_11 = -10.0 * self.lnL_homo_major / LN10
346 self.PL_00 = max( 0, self.PL_00 - self.PL_11 )
347 self.PL_01 = max( 0, self.PL_01 - self.PL_11 )
348 self.PL_11 = 0
350 self.GQ = min( self.PL_00, self.PL_01 )
351 self.alt_allele = self.top1allele
352 else:
353 # assign GQ, GT, and type
354 if self.ref_allele != self.top1allele and self.BIC_homo_major + 2 <= self.BIC_homo_minor and self.BIC_homo_major + 2 <= self.BIC_heter_noAS and self.BIC_homo_major + 2 <= self.BIC_heter_AS:
355 self.type = "homo"
356 self.deltaBIC = min( self.BIC_heter_noAS, self.BIC_heter_AS, self.BIC_homo_minor ) - self.BIC_homo_major
357 self.GT = "1/1"
358 self.alt_allele = self.top1allele
360 self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
361 self.PL_01 = -10.0 * max( self.lnL_heter_noAS, self.lnL_heter_AS ) / LN10
362 self.PL_11 = -10.0 * self.lnL_homo_major / LN10
364 self.PL_00 = self.PL_00 - self.PL_11
365 self.PL_01 = self.PL_01 - self.PL_11
366 self.PL_11 = 0
368 self.GQ = min( self.PL_00, self.PL_01 )
370 elif self.BIC_heter_noAS + 2 <= self.BIC_homo_major and self.BIC_heter_noAS + 2 <= self.BIC_homo_minor and self.BIC_heter_noAS + 2 <= self.BIC_heter_AS :
371 self.type = "heter_noAS"
372 self.deltaBIC = min( self.BIC_homo_major, self.BIC_homo_minor ) - self.BIC_heter_noAS
374 self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
375 self.PL_01 = -10.0 * self.lnL_heter_noAS / LN10
376 self.PL_11 = -10.0 * self.lnL_homo_major / LN10
378 self.PL_00 = self.PL_00 - self.PL_01
379 self.PL_11 = self.PL_11 - self.PL_01
380 self.PL_01 = 0
382 self.GQ = min( self.PL_00, self.PL_11 )
384 elif self.BIC_heter_AS + 2 <= self.BIC_homo_major and self.BIC_heter_AS + 2 <= self.BIC_homo_minor and self.BIC_heter_AS + 2 <= self.BIC_heter_noAS:
385 self.type = "heter_AS"
386 self.deltaBIC = min( self.BIC_homo_major, self.BIC_homo_minor ) - self.BIC_heter_AS
388 self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
389 self.PL_01 = -10.0 * self.lnL_heter_AS / LN10
390 self.PL_11 = -10.0 * self.lnL_homo_major / LN10
392 self.PL_00 = self.PL_00 - self.PL_01
393 self.PL_11 = self.PL_11 - self.PL_01
394 self.PL_01 = 0
396 self.GQ = min( self.PL_00, self.PL_11 )
398 elif self.BIC_heter_AS + 2 <= self.BIC_homo_major and self.BIC_heter_AS + 2 <= self.BIC_homo_minor:
399 # can't decide if it's noAS or AS
400 self.type = "heter_unsure"
401 self.deltaBIC = min( self.BIC_homo_major, self.BIC_homo_minor ) - max( self.BIC_heter_AS, self.BIC_heter_noAS )
403 self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
404 self.PL_01 = -10.0 * max( self.lnL_heter_noAS, self.lnL_heter_AS ) / LN10
405 self.PL_11 = -10.0 * self.lnL_homo_major / LN10
407 self.PL_00 = self.PL_00 - self.PL_01
408 self.PL_11 = self.PL_11 - self.PL_01
409 self.PL_01 = 0
411 self.GQ = min( self.PL_00, self.PL_11 )
413 elif self.ref_allele == self.top1allele and self.BIC_homo_major < self.BIC_homo_minor and self.BIC_homo_major < self.BIC_heter_noAS and self.BIC_homo_major < self.BIC_heter_AS:
414 self.type = "homo_ref"
415 # we do not calculate GQ if type is homo_ref
416 self.GT = "0/0"
417 self.filterout = True
418 else:
419 self.type="unsure"
420 self.filterout = True
422 if self.type.startswith( "heter" ):
423 if self.ref_allele == self.top1allele:
424 self.alt_allele = self.top2allele
425 self.GT = "0/1"
426 elif self.ref_allele == self.top2allele:
427 self.alt_allele = self.top1allele
428 self.GT = "0/1"
429 else:
430 self.alt_allele = self.top1allele+b','+self.top2allele
431 self.GT = "1/2"
432 # strand bias filter, uncomment following if wish to debug
433 # calculate SB score
434 #print "calculate SB score for ", self.ref_pos, "a/b/c/d:", self.n_strand[ 0 ][ self.top1allele ], self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top1allele ], self.n_strand[ 1 ][ self.top2allele ]
435 #SBscore = self.SB_score_ChIP( self.n_strand[ 0 ][ self.top1allele ], self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top1allele ], self.n_strand[ 1 ][ self.top2allele ] )
436 #SBscore = 0
437 #if SBscore >= 1:
438 # print "disgard variant at", self.ref_pos, "type", self.type
439 # self.filterout = True
441 # if self.ref_allele == self.top1alleleļ¼š
442 # self.n_strand[ 0 ][ self.top1allele ] + self.n_strand[ 1 ][ self.top1allele ]
443 # if and self.n_strand[ 0 ][ self.top2allele ] == 0 or self.n_strand[ 1 ][ self.top2allele ] == 0:
444 # self.filterout = True
445 # print self.ref_pos
448 # self.deltaBIC = self.deltaBIC
450 tmp_mutation_type = []
451 for tmp_alt in self.alt_allele.split(b','):
452 if tmp_alt == b'*':
453 tmp_mutation_type.append( "Deletion" )
454 elif len( tmp_alt ) > 1:
455 tmp_mutation_type.append( "Insertion" )
456 else:
457 tmp_mutation_type.append( "SNV" )
458 self.mutation_type = ",".join( tmp_mutation_type )
459 return
461 cdef float SB_score_ChIP( self, int a, int b, int c, int d ):
462 """ calculate score for filtering variants with strange strand biases.
464 a: top1/major allele plus strand
465 b: top2/minor allele plus strand
466 c: top1/major allele minus strand
467 d: top2/minor allele minus strand
469 Return a float value so that if this value >= 1, the variant will be filtered out.
471 cdef:
472 float score
473 double p
474 double p1_l, p1_r
475 double p2_l, p2_r
476 double top2_sb, top1_sb
478 if a+b == 0 or c+d == 0:
479 # if major allele and minor allele both bias to the same strand, allow it
480 return 0.0
482 # Rule:
483 # if there is bias in top2 allele then bias in top1 allele should not be significantly smaller than it.
484 # or there is no significant bias (0.5) in top2 allele.
486 #print a, b, c, d
487 p1_l = binomial_cdf( a, (a+c), 0.5, lower=True ) # alternative: less than 0.5
488 p1_r = binomial_cdf( c, (a+c), 0.5, lower=True ) # greater than 0.5
489 p2_l = binomial_cdf( b, (b+d), 0.5, lower=True ) # alternative: less than 0.5
490 p2_r = binomial_cdf( d, (b+d), 0.5, lower=True ) # greater than 0.5
491 #print p1_l, p1_r, p2_l, p2_r
493 if (p1_l < 0.05 and p2_r < 0.05) or (p1_r < 0.05 and p2_l < 0.05):
494 # we reject loci where the significant biases are inconsistent between top1 and top2 alleles.
495 return 1.0
496 else:
497 # if b<=2 and d=0 or b=0 and d<=2 -- highly possible FPs
498 #if ( b<=2 and d==0 or b==0 and d<=2 ):
499 # return 1
500 # can't decide
501 return 0.0
503 cdef float SB_score_ATAC( self, int a, int b, int c, int d ):
504 """ calculate score for filtering variants with strange strand biases.
506 ATAC-seq version
508 a: top1/major allele plus strand
509 b: top2/minor allele plus strand
510 c: top1/major allele minus strand
511 d: top2/minor allele minus strand
513 Return a float value so that if this value >= 1, the variant will be filtered out.
515 cdef:
516 float score
517 double p
518 double p1_l, p1_r
519 double p2_l, p2_r
520 double top2_sb, top1_sb
522 if a+b == 0 or c+d == 0:
523 # if major allele and minor allele both bias to the same strand, allow it
524 return 0.0
526 # Rule:
527 # if there is bias in top2 allele then bias in top1 allele should not be significantly smaller than it.
528 # or there is no significant bias (0.5) in top2 allele.
530 #print a, b, c, d
531 p1_l = binomial_cdf( a, (a+c), 0.5, lower=True ) # alternative: less than 0.5
532 p1_r = binomial_cdf( c, (a+c), 0.5, lower=True ) # greater than 0.5
533 p2_l = binomial_cdf( b, (b+d), 0.5, lower=True ) # alternative: less than 0.5
534 p2_r = binomial_cdf( d, (b+d), 0.5, lower=True ) # greater than 0.5
535 #print p1_l, p1_r, p2_l, p2_r
537 if (p1_l < 0.05 and p2_r < 0.05) or (p1_r < 0.05 and p2_l < 0.05):
538 # we reject loci where the significant biases are inconsistent between top1 and top2 alleles.
539 return 1.0
540 else:
541 # can't decide
542 return 0.0
544 cpdef str to_vcf ( self ):
545 """Output REF,ALT,QUAL,FILTER,INFO,FORMAT, SAMPLE columns.
547 cdef:
548 str vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info, vcf_format, vcf_sample
550 vcf_ref = self.ref_allele.decode()
551 vcf_alt = self.alt_allele.decode()
552 vcf_qual = "%d" % self.GQ
553 vcf_filter = "."
554 vcf_info = (b"M=%s;MT=%s;DPT=%d;DPC=%d;DP1T=%d%s;DP2T=%d%s;DP1C=%d%s;DP2C=%d%s;SB=%d,%d,%d,%d;DBIC=%.2f;BICHOMOMAJOR=%.2f;BICHOMOMINOR=%.2f;BICHETERNOAS=%.2f;BICHETERAS=%.2f;AR=%.2f" % \
555 (self.type.encode(), self.mutation_type.encode(), sum( self.n_reads_T.values() ), sum( self.n_reads_C.values() ),
556 self.n_reads_T[self.top1allele], self.top1allele, self.n_reads_T[self.top2allele], self.top2allele,
557 self.n_reads_C[self.top1allele], self.top1allele, self.n_reads_C[self.top2allele], self.top2allele,
558 self.n_strand[ 0 ][ self.top1allele ], self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top1allele ], self.n_strand[ 1 ][ self.top2allele ],
559 self.deltaBIC,
560 self.BIC_homo_major, self.BIC_homo_minor, self.BIC_heter_noAS,self.BIC_heter_AS,
561 self.n_reads_T[self.top1allele]/(self.n_reads_T[self.top1allele]+self.n_reads_T[self.top2allele])
562 )).decode()
563 vcf_format = "GT:DP:GQ:PL"
564 vcf_sample = "%s:%d:%d:%d,%d,%d" % (self.GT, self.raw_read_depth( opt = "all" ), self.GQ, self.PL_00, self.PL_01, self.PL_11)
565 return "\t".join( ( vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info, vcf_format, vcf_sample ) )
567 cpdef toVariant ( self ):
568 cdef:
569 object v
570 v = Variant(
571 self.ref_allele.decode(),
572 self.alt_allele.decode(),
573 self.GQ,
574 '.',
575 self.type,
576 self.mutation_type,
577 self.top1allele.decode(),
578 self.top2allele.decode(),
579 sum( self.n_reads_T.values() ),
580 sum( self.n_reads_C.values() ),
581 self.n_reads_T[self.top1allele],
582 self.n_reads_T[self.top2allele],
583 self.n_reads_C[self.top1allele],
584 self.n_reads_C[self.top2allele],
585 self.n_strand[ 0 ][ self.top1allele ],
586 self.n_strand[ 0 ][ self.top2allele ],
587 self.n_strand[ 1 ][ self.top1allele ],
588 self.n_strand[ 1 ][ self.top2allele ],
589 self.deltaBIC,
590 self.BIC_homo_major,
591 self.BIC_homo_minor,
592 self.BIC_heter_noAS,
593 self.BIC_heter_AS,
594 self.n_reads_T[self.top1allele]/(self.n_reads_T[self.top1allele]+self.n_reads_T[self.top2allele]),
595 self.GT,
596 self.raw_read_depth( opt = "all" ),
597 self.PL_00,
598 self.PL_01,
599 self.PL_11 )
600 return v