MACS3/Signal/PosReadsInfo.pyx

   1 # cython: language_level=3
   2 # cython: profile=True
   3 # Time-stamp: <2020-12-04 23:10:35 Tao Liu>
   4
   5 """Module for SAPPER PosReadsInfo class.
   6
   7 Copyright (c) 2017 Tao Liu <tliu4@buffalo.edu>
   8
   9 This code is free software; you can redistribute it and/or modify it
  10 under the terms of the BSD License (see the file COPYING included
  11 with the distribution).
  12
  13 @status:  experimental
  14 @version: $Revision$
  15 @author:  Tao Liu
  16 @contact: tliu4@buffalo.edu
  17 """
  18
  19 # ------------------------------------
  20 # python modules
  21 # ------------------------------------
  22 from MACS3.Signal.VariantStat import CalModel_Homo, CalModel_Heter_noAS, CalModel_Heter_AS, calculate_GQ, calculate_GQ_heterASsig
  23 from MACS3.Signal.Prob import binomial_cdf
  24 from MACS3.Signal.PeakVariants import Variant
  25
  26 from cpython cimport bool
  27
  28 import numpy as np
  29 cimport numpy as np
  30 from numpy cimport uint32_t, uint64_t, int32_t, float32_t
  31
  32 LN10 = 2.3025850929940458
  33
  34 cdef extern from "stdlib.h":
  35     ctypedef unsigned int size_t
  36     size_t strlen(char *s)
  37     void *malloc(size_t size)
  38     void *calloc(size_t n, size_t size)
  39     void free(void *ptr)
  40     int strcmp(char *a, char *b)
  41     char * strcpy(char *a, char *b)
  42     long atol(char *bytes)
  43     int atoi(char *bytes)
  44
  45 # ------------------------------------
  46 # constants
  47 # ------------------------------------
  48 __version__ = "Parser $Revision$"
  49 __author__ = "Tao Liu <tliu4@buffalo.edu>"
  50 __doc__ = "All Parser classes"
  51
  52 # ------------------------------------
  53 # Misc functions
  54 # ------------------------------------
  55
  56 # ------------------------------------
  57 # Classes
  58 # ------------------------------------
  59
  60 cdef class PosReadsInfo:
  61     cdef:
  62         long ref_pos
  63         bytes ref_allele
  64         bytes alt_allele
  65         bool filterout          # if true, do not output
  66
  67         dict bq_set_T     #{A:[], C:[], G:[], T:[], N:[]} for treatment
  68         dict bq_set_C
  69         dict n_reads_T    #{A:[], C:[], G:[], T:[], N:[]} for treatment
  70         dict n_reads_C
  71         dict n_reads
  72
  73         list n_strand #[{A:[], C:[], G:[], T:[], N:[]},{A:[], C:[], G:[], T:[], N:[]}] for total appearance on plus strand and minus strand for ChIP sample only
  74         dict n_tips # count of nt appearing at tips
  75
  76         bytes top1allele
  77         bytes top2allele
  78         float top12alleles_ratio
  79
  80         double lnL_homo_major,lnL_heter_AS,lnL_heter_noAS,lnL_homo_minor
  81         double BIC_homo_major,BIC_heter_AS,BIC_heter_noAS,BIC_homo_minor
  82         double PL_00, PL_01, PL_11
  83         double deltaBIC
  84         int heter_noAS_kc, heter_noAS_ki
  85         int heter_AS_kc, heter_AS_ki
  86         double heter_AS_alleleratio
  87
  88         int GQ_homo_major,GQ_heter_noAS,GQ_heter_AS  #phred scale of prob by standard formular
  89         int GQ_heter_ASsig #phred scale of prob, to measure the difference between AS and noAS
  90
  91         double GQ
  92
  93         str GT
  94         str type
  95         str mutation_type       # SNV or Insertion or Deletion
  96
  97         bool hasfermiinfor #if no fermi bam overlap in the position, false; if fermi bam in the position GT: N, false; if anyone of top2allele is not in fermi GT NTs, false;
  98         bytearray fermiNTs #
  99
 100     def __cinit__ ( self ):
 101         self.filterout = False
 102         self.GQ = 0
 103         self.GT = "unsure"
 104         self.alt_allele = b'.'
 105
 106     def __init__ ( self, long ref_pos, bytes ref_allele ):
 107         self.ref_pos = ref_pos
 108         self.ref_allele = ref_allele
 109         self.bq_set_T = { ref_allele:[],b'A':[], b'C':[], b'G':[], b'T':[], b'N':[], b'*':[] }
 110         self.bq_set_C = { ref_allele:[],b'A':[], b'C':[], b'G':[], b'T':[], b'N':[], b'*':[] }
 111         self.n_reads_T = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 }
 112         self.n_reads_C = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 }
 113         self.n_reads =  { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 }
 114         self.n_strand = [ { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 }, { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 } ]
 115         self.n_tips = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 }
 116
 117
 118     #cpdef void merge ( self, PosReadsInfo PRI2 ):
 119     #    """Merge two PRIs. No check available.
 120     #
 121     #    """
 122     #    assert self.ref_pos == PRI2.ref_pos
 123     #    assert self.ref_allele == PRI2.ref_allele
 124     #    for b in set( self.n_reads.keys() ).union( set( PRI2.n_reads.keys() ) ):
 125     #        self.bq_set_T[ b ] = self.bq_set_T.get( b, []).extend( PRI2.bq_set_T.get( b, [] ) )
 126     #        self.bq_set_C[ b ] = self.bq_set_C.get( b, []).extend( PRI2.bq_set_C.get( b, [] ) )
 127     #        self.n_reads_T[ b ] = self.n_reads_T.get( b, 0) + PRI2.n_reads_T.get( b, 0 )
 128     #        self.n_reads_C[ b ] = self.n_reads_C.get( b, 0) + PRI2.n_reads_C.get( b, 0 )
 129     #        self.n_reads[ b ] = self.n_reads.get( b, 0) + PRI2.n_reads.get( b, 0 )
 130     #    return
 131
 132     def __getstate__ ( self ):
 133         return ( self.ref_pos, self.ref_allele, self.alt_allele, self.filterout,
 134                  self.bq_set_T, self.bq_set_C, self.n_reads_T, self.n_reads_C, self.n_reads, self.n_strand, self.n_tips,
 135                  self.top1allele, self.top2allele, self.top12alleles_ratio,
 136                  self.lnL_homo_major, self.lnL_heter_AS, self.lnL_heter_noAS, self.lnL_homo_minor,
 137                  self.BIC_homo_major, self.BIC_heter_AS, self.BIC_heter_noAS, self.BIC_homo_minor,
 138                  self.heter_noAS_kc, self.heter_noAS_ki,
 139                  self.heter_AS_kc, self.heter_AS_ki,
 140                  self.heter_AS_alleleratio,
 141                  self.GQ_homo_major, self.GQ_heter_noAS, self.GQ_heter_AS,
 142                  self.GQ_heter_ASsig,
 143                  self.GQ,
 144                  self.GT,
 145                  self.type,
 146                  self.hasfermiinfor,
 147                  self.fermiNTs )
 148
 149     def __setstate__ ( self, state ):
 150         ( self.ref_pos, self.ref_allele, self.alt_allele, self.filterout,
 151           self.bq_set_T, self.bq_set_C, self.n_reads_T, self.n_reads_C, self.n_reads, self.n_strand, self.n_tips,
 152           self.top1allele, self.top2allele, self.top12alleles_ratio,
 153           self.lnL_homo_major, self.lnL_heter_AS, self.lnL_heter_noAS, self.lnL_homo_minor,
 154           self.BIC_homo_major, self.BIC_heter_AS, self.BIC_heter_noAS, self.BIC_homo_minor,
 155           self.heter_noAS_kc, self.heter_noAS_ki,
 156           self.heter_AS_kc, self.heter_AS_ki,
 157           self.heter_AS_alleleratio,
 158           self.GQ_homo_major, self.GQ_heter_noAS, self.GQ_heter_AS,
 159           self.GQ_heter_ASsig,
 160           self.GQ,
 161           self.GT,
 162           self.type,
 163           self.hasfermiinfor,
 164           self.fermiNTs ) = state
 165
 166     cpdef bool filterflag ( self ):
 167         return self.filterout
 168
 169     cpdef void apply_GQ_cutoff ( self, int min_homo_GQ = 50, int min_heter_GQ = 100 ):
 170         if self.filterout:
 171             return
 172         if self.type.startswith('homo') and self.GQ < min_homo_GQ:
 173             self.filterout = True
 174         elif self.type.startswith('heter') and self.GQ < min_heter_GQ:
 175             self.filterout = True
 176         return
 177
 178     cpdef void apply_deltaBIC_cutoff ( self, float min_delta_BIC = 10 ):
 179         if self.filterout:
 180             return
 181         if self.deltaBIC < min_delta_BIC:
 182             self.filterout = True
 183         return
 184
 185     cpdef void add_T ( self, int read_index, bytes read_allele, int read_bq, int strand, bool tip, int Q=20 ):
 186         """ Strand 0: plus, 1: minus
 187
 188         Q is the quality cutoff. By default, only consider Q20 or read_bq > 20.
 189         """
 190         if read_bq <= Q:
 191             return
 192         if not self.n_reads.has_key( read_allele ):
 193             self.bq_set_T[read_allele] = []
 194             self.bq_set_C[read_allele] = []
 195             self.n_reads_T[read_allele] = 0
 196             self.n_reads_C[read_allele] = 0
 197             self.n_reads[read_allele] = 0
 198             self.n_strand[ 0 ][ read_allele ] = 0
 199             self.n_strand[ 1 ][ read_allele ] = 0
 200             self.n_tips[read_allele] = 0
 201         self.bq_set_T[read_allele].append( read_bq )
 202         self.n_reads_T[ read_allele ] += 1
 203         self.n_reads[ read_allele ] += 1
 204         self.n_strand[ strand ][ read_allele ] += 1
 205         if tip: self.n_tips[ read_allele ] += 1
 206
 207     cpdef void add_C ( self, int read_index, bytes read_allele, int read_bq, int strand, int Q=20 ):
 208         if read_bq <= Q:
 209             return
 210         if not self.n_reads.has_key( read_allele ):
 211             self.bq_set_T[read_allele] = []
 212             self.bq_set_C[read_allele] = []
 213             self.n_reads_T[read_allele] = 0
 214             self.n_reads_C[read_allele] = 0
 215             self.n_reads[read_allele] = 0
 216             self.n_strand[ 0 ][ read_allele ] = 0
 217             self.n_strand[ 1 ][ read_allele ] = 0
 218             self.n_tips[read_allele] = 0
 219         self.bq_set_C[read_allele].append( read_bq )
 220         self.n_reads_C[ read_allele ] += 1
 221         self.n_reads[ read_allele ] += 1
 222         #self.n_strand[ strand ][ read_allele ] += 1
 223
 224     cpdef int raw_read_depth ( self, str opt = "all" ):
 225         if opt == "all":
 226             return sum( self.n_reads.values() )
 227         elif opt == "T":
 228             return sum( self.n_reads_T.values() )
 229         elif opt == "C":
 230             return sum( self.n_reads_C.values() )
 231         else:
 232             raise Exception( "opt should be either 'all', 'T' or 'C'." )
 233
 234     cpdef void update_top_alleles ( self, float min_top12alleles_ratio = 0.8, int min_altallele_count = 2, float max_allowed_ar = 0.95 ):
 235     #cpdef update_top_alleles ( self, float min_top12alleles_ratio = 0.8 ):
 236         """Identify top1 and top2 NT.  the ratio of (top1+top2)/total
 237         """
 238         cdef:
 239             float r
 240
 241         [self.top1allele, self.top2allele] = sorted(self.n_reads, key=self.n_reads_T.get, reverse=True)[:2]
 242
 243         # if top2 allele count in ChIP is lower than
 244         # min_altallele_count, or when allele ratio top1/(top1+top2)
 245         # is larger than max_allowed_ar in ChIP, we won't consider
 246         # this allele at all.  we set values of top2 allele in
 247         # dictionaries to [] and ignore top2 allele entirely.
 248
 249         # max(self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top2allele ]) < min_altallele_count
 250         #if self.ref_pos == 52608504:
 251         #    print self.ref_pos, self.n_reads_T[ self.top1allele ], self.n_reads_T[ self.top2allele ], self.n_reads_C[ self.top1allele ], self.n_reads_C[ self.top2allele ]
 252         if self.n_reads_T[ self.top1allele ] + self.n_reads_T[ self.top2allele ] == 0:
 253             self.filterout = True
 254             return
 255
 256         if (len(self.top1allele)==1 and len(self.top2allele)==1) and ( self.top2allele != self.ref_allele and ( ( self.n_reads_T[ self.top2allele ] - self.n_tips[ self.top2allele ] ) < min_altallele_count ) or \
 257                 self.n_reads_T[ self.top1allele ]/(self.n_reads_T[ self.top1allele ] + self.n_reads_T[ self.top2allele ]) > max_allowed_ar ):
 258             self.bq_set_T[ self.top2allele ] = []
 259             self.bq_set_C[ self.top2allele ] = []
 260             self.n_reads_T[ self.top2allele ] = 0
 261             self.n_reads_C[ self.top2allele ] = 0
 262             self.n_reads[ self.top2allele ] = 0
 263             self.n_tips[ self.top2allele ] = 0
 264             if ( self.top1allele != self.ref_allele and ( self.n_reads_T[ self.top1allele ] - self.n_tips[ self.top1allele ] ) < min_altallele_count ):
 265                 self.bq_set_T[ self.top1allele ] = []
 266                 self.bq_set_C[ self.top1allele ] = []
 267                 self.n_reads_T[ self.top1allele ] = 0
 268                 self.n_reads_C[ self.top1allele ] = 0
 269                 self.n_reads[ self.top1allele ] = 0
 270                 self.n_tips[ self.top1allele ] = 0
 271
 272         if self.n_reads_T[ self.top1allele ] + self.n_reads_T[ self.top2allele ] == 0:
 273             self.filterout = True
 274             return
 275
 276         self.top12alleles_ratio = ( self.n_reads[ self.top1allele ] + self.n_reads[ self.top2allele ] ) /  sum( self.n_reads.values() )
 277         if self.top12alleles_ratio < min_top12alleles_ratio:
 278             self.filterout = True
 279             return
 280
 281         if self.top1allele == self.ref_allele and self.n_reads[ self.top2allele ] == 0:
 282             # This means this position only contains top1allele which is the ref_allele. So the GT must be 0/0
 283             self.type = "homo_ref"
 284             self.filterout = True
 285             return
 286         return
 287
 288     cpdef void top12alleles ( self ):
 289         print ( self.ref_pos, self.ref_allele)
 290         print ("Top1allele",self.top1allele, "Treatment", self.bq_set_T[self.top1allele], "Control", self.bq_set_C[self.top1allele])
 291         print ("Top2allele",self.top2allele, "Treatment", self.bq_set_T[self.top2allele], "Control", self.bq_set_C[self.top2allele])
 292
 293     cpdef void call_GT ( self, float max_allowed_ar = 0.99 ):
 294         """Require update_top_alleles being called.
 295         """
 296         cdef:
 297             np.ndarray[np.int32_t, ndim=1] top1_bq_T
 298             np.ndarray[np.int32_t, ndim=1] top2_bq_T
 299             np.ndarray[np.int32_t, ndim=1] top1_bq_C
 300             np.ndarray[np.int32_t, ndim=1] top2_bq_C
 301             int i
 302             list top1_bq_T_l
 303             list top2_bq_T_l
 304             list top1_bq_C_l
 305             list top2_bq_C_l
 306             list tmp_mutation_type
 307             bytes tmp_alt
 308
 309         if self.filterout:
 310             return
 311
 312         top1_bq_T = np.array( self.bq_set_T[ self.top1allele ], dtype="int32" )
 313         top2_bq_T = np.array( self.bq_set_T[ self.top2allele ], dtype="int32" )
 314         top1_bq_C = np.array( self.bq_set_C[ self.top1allele ], dtype="int32" )
 315         top2_bq_C = np.array( self.bq_set_C[ self.top2allele ], dtype="int32" )
 316         (self.lnL_homo_major, self.BIC_homo_major) = CalModel_Homo( top1_bq_T, top1_bq_C, top2_bq_T, top2_bq_C )
 317         (self.lnL_homo_minor, self.BIC_homo_minor) = CalModel_Homo( top2_bq_T, top2_bq_C, top1_bq_T, top1_bq_C )
 318         (self.lnL_heter_noAS, self.BIC_heter_noAS) = CalModel_Heter_noAS( top1_bq_T, top1_bq_C, top2_bq_T, top2_bq_C )
 319         (self.lnL_heter_AS, self.BIC_heter_AS)     = CalModel_Heter_AS( top1_bq_T, top1_bq_C, top2_bq_T, top2_bq_C, max_allowed_ar )
 320
 321         #if self.ref_pos == 71078525:
 322         #    print "---"
 323         #    print len( top1_bq_T ), len( top1_bq_C ), len( top2_bq_T ), len( top2_bq_C )
 324         #    print self.lnL_homo_major, self.lnL_homo_minor, self.lnL_heter_noAS, self.lnL_heter_AS
 325         #    print self.BIC_homo_major, self.BIC_homo_minor, self.BIC_heter_noAS, self.BIC_heter_AS
 326
 327         if self.top1allele != self.ref_allele and self.n_reads[ self.top2allele ] == 0:
 328             # in this case, there is no top2 nt (or socalled minor
 329             # allele) in either treatment or control, we should assume
 330             # it's a 1/1 genotype. We will take 1/1 if it passes BIC
 331             # test (deltaBIC >=2), and will skip this loci if it can't
 332             # pass the test.
 333
 334             self.deltaBIC = min( self.BIC_heter_noAS, self.BIC_heter_AS, self.BIC_homo_minor ) - self.BIC_homo_major
 335             if self.deltaBIC < 2:
 336                self.filterout = True
 337                return
 338
 339             self.type = "homo"
 340             self.GT = "1/1"
 341
 342             self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
 343             self.PL_01 = -10.0 * max( self.lnL_heter_noAS, self.lnL_heter_AS ) / LN10
 344             self.PL_11 = -10.0 * self.lnL_homo_major / LN10
 345
 346             self.PL_00 = max( 0, self.PL_00 - self.PL_11 )
 347             self.PL_01 = max( 0, self.PL_01 - self.PL_11 )
 348             self.PL_11 = 0
 349
 350             self.GQ = min( self.PL_00, self.PL_01 )
 351             self.alt_allele = self.top1allele
 352         else:
 353             # assign GQ, GT, and type
 354             if self.ref_allele != self.top1allele and self.BIC_homo_major + 2 <= self.BIC_homo_minor and self.BIC_homo_major + 2 <= self.BIC_heter_noAS and self.BIC_homo_major + 2 <= self.BIC_heter_AS:
 355                 self.type = "homo"
 356                 self.deltaBIC = min( self.BIC_heter_noAS, self.BIC_heter_AS, self.BIC_homo_minor ) - self.BIC_homo_major
 357                 self.GT = "1/1"
 358                 self.alt_allele = self.top1allele
 359
 360                 self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
 361                 self.PL_01 = -10.0 * max( self.lnL_heter_noAS, self.lnL_heter_AS ) / LN10
 362                 self.PL_11 = -10.0 * self.lnL_homo_major / LN10
 363
 364                 self.PL_00 = self.PL_00 - self.PL_11
 365                 self.PL_01 = self.PL_01 - self.PL_11
 366                 self.PL_11 = 0
 367
 368                 self.GQ = min( self.PL_00, self.PL_01 )
 369
 370             elif self.BIC_heter_noAS + 2 <= self.BIC_homo_major and self.BIC_heter_noAS + 2 <= self.BIC_homo_minor and self.BIC_heter_noAS + 2 <= self.BIC_heter_AS :
 371                 self.type = "heter_noAS"
 372                 self.deltaBIC = min( self.BIC_homo_major, self.BIC_homo_minor ) - self.BIC_heter_noAS
 373
 374                 self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
 375                 self.PL_01 = -10.0 * self.lnL_heter_noAS / LN10
 376                 self.PL_11 = -10.0 * self.lnL_homo_major / LN10
 377
 378                 self.PL_00 = self.PL_00 - self.PL_01
 379                 self.PL_11 = self.PL_11 - self.PL_01
 380                 self.PL_01 = 0
 381
 382                 self.GQ = min( self.PL_00, self.PL_11 )
 383
 384             elif self.BIC_heter_AS + 2 <= self.BIC_homo_major and self.BIC_heter_AS + 2 <= self.BIC_homo_minor and self.BIC_heter_AS + 2 <= self.BIC_heter_noAS:
 385                 self.type = "heter_AS"
 386                 self.deltaBIC = min( self.BIC_homo_major, self.BIC_homo_minor ) - self.BIC_heter_AS
 387
 388                 self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
 389                 self.PL_01 = -10.0 * self.lnL_heter_AS / LN10
 390                 self.PL_11 = -10.0 * self.lnL_homo_major / LN10
 391
 392                 self.PL_00 = self.PL_00 - self.PL_01
 393                 self.PL_11 = self.PL_11 - self.PL_01
 394                 self.PL_01 = 0
 395
 396                 self.GQ = min( self.PL_00, self.PL_11 )
 397
 398             elif self.BIC_heter_AS + 2 <= self.BIC_homo_major and self.BIC_heter_AS + 2 <= self.BIC_homo_minor:
 399                 # can't decide if it's noAS or AS
 400                 self.type = "heter_unsure"
 401                 self.deltaBIC = min( self.BIC_homo_major, self.BIC_homo_minor ) - max( self.BIC_heter_AS, self.BIC_heter_noAS )
 402
 403                 self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
 404                 self.PL_01 = -10.0 * max( self.lnL_heter_noAS, self.lnL_heter_AS ) / LN10
 405                 self.PL_11 = -10.0 * self.lnL_homo_major / LN10
 406
 407                 self.PL_00 = self.PL_00 - self.PL_01
 408                 self.PL_11 = self.PL_11 - self.PL_01
 409                 self.PL_01 = 0
 410
 411                 self.GQ = min( self.PL_00, self.PL_11 )
 412
 413             elif self.ref_allele == self.top1allele and self.BIC_homo_major < self.BIC_homo_minor and self.BIC_homo_major < self.BIC_heter_noAS and self.BIC_homo_major < self.BIC_heter_AS:
 414                 self.type = "homo_ref"
 415                 # we do not calculate GQ if type is homo_ref
 416                 self.GT = "0/0"
 417                 self.filterout = True
 418             else:
 419                 self.type="unsure"
 420                 self.filterout = True
 421
 422             if self.type.startswith( "heter" ):
 423                 if self.ref_allele == self.top1allele:
 424                     self.alt_allele = self.top2allele
 425                     self.GT = "0/1"
 426                 elif self.ref_allele == self.top2allele:
 427                     self.alt_allele = self.top1allele
 428                     self.GT = "0/1"
 429                 else:
 430                     self.alt_allele = self.top1allele+b','+self.top2allele
 431                     self.GT = "1/2"
 432                 # strand bias filter, uncomment following if wish to debug
 433                 # calculate SB score
 434                 #print "calculate SB score for ", self.ref_pos, "a/b/c/d:", self.n_strand[ 0 ][ self.top1allele ], self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top1allele ], self.n_strand[ 1 ][ self.top2allele ]
 435                 #SBscore = self.SB_score_ChIP( self.n_strand[ 0 ][ self.top1allele ], self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top1allele ], self.n_strand[ 1 ][ self.top2allele ] )
 436                 #SBscore = 0
 437                 #if SBscore >= 1:
 438                 #    print "disgard variant at", self.ref_pos, "type", self.type
 439                 #    self.filterout = True
 440
 441                 # if self.ref_allele == self.top1allele：
 442                 #     self.n_strand[ 0 ][ self.top1allele ] + self.n_strand[ 1 ][ self.top1allele ]
 443                 #     if and self.n_strand[ 0 ][ self.top2allele ] == 0 or self.n_strand[ 1 ][ self.top2allele ] == 0:
 444                 #         self.filterout = True
 445                 #         print self.ref_pos
 446
 447
 448             # self.deltaBIC = self.deltaBIC
 449
 450         tmp_mutation_type = []
 451         for tmp_alt in self.alt_allele.split(b','):
 452             if tmp_alt == b'*':
 453                 tmp_mutation_type.append( "Deletion" )
 454             elif len( tmp_alt ) > 1:
 455                 tmp_mutation_type.append( "Insertion" )
 456             else:
 457                 tmp_mutation_type.append( "SNV" )
 458         self.mutation_type = ",".join( tmp_mutation_type )
 459         return
 460
 461     cdef float SB_score_ChIP( self, int a, int b, int c, int d ):
 462         """ calculate score for filtering variants with strange strand biases.
 463
 464         a: top1/major allele plus strand
 465         b: top2/minor allele plus strand
 466         c: top1/major allele minus strand
 467         d: top2/minor allele minus strand
 468
 469         Return a float value so that if this value >= 1, the variant will be filtered out.
 470         """
 471         cdef:
 472             float score
 473             double p
 474             double p1_l, p1_r
 475             double p2_l, p2_r
 476             double top2_sb, top1_sb
 477
 478         if a+b == 0 or c+d == 0:
 479             # if major allele and minor allele both bias to the same strand, allow it
 480             return 0.0
 481
 482         # Rule:
 483         # if there is bias in top2 allele then bias in top1 allele should not be significantly smaller than it.
 484         # or there is no significant bias (0.5) in top2 allele.
 485
 486         #print a, b, c, d
 487         p1_l = binomial_cdf( a, (a+c), 0.5, lower=True )      # alternative: less than 0.5
 488         p1_r = binomial_cdf( c, (a+c), 0.5, lower=True )   #              greater than 0.5
 489         p2_l = binomial_cdf( b, (b+d), 0.5, lower=True )      # alternative: less than 0.5
 490         p2_r = binomial_cdf( d, (b+d), 0.5, lower=True )   #              greater than 0.5
 491         #print p1_l, p1_r, p2_l, p2_r
 492
 493         if (p1_l < 0.05 and p2_r < 0.05) or (p1_r < 0.05 and p2_l < 0.05):
 494             # we reject loci where the significant biases are inconsistent between top1 and top2 alleles.
 495             return 1.0
 496         else:
 497             # if b<=2 and d=0 or b=0 and d<=2 -- highly possible FPs
 498             #if ( b<=2 and d==0 or b==0 and d<=2 ):
 499             #    return 1
 500             # can't decide
 501             return 0.0
 502
 503     cdef float SB_score_ATAC( self, int a, int b, int c, int d ):
 504         """ calculate score for filtering variants with strange strand biases.
 505
 506         ATAC-seq version
 507
 508         a: top1/major allele plus strand
 509         b: top2/minor allele plus strand
 510         c: top1/major allele minus strand
 511         d: top2/minor allele minus strand
 512
 513         Return a float value so that if this value >= 1, the variant will be filtered out.
 514         """
 515         cdef:
 516             float score
 517             double p
 518             double p1_l, p1_r
 519             double p2_l, p2_r
 520             double top2_sb, top1_sb
 521
 522         if a+b == 0 or c+d == 0:
 523             # if major allele and minor allele both bias to the same strand, allow it
 524             return 0.0
 525
 526         # Rule:
 527         # if there is bias in top2 allele then bias in top1 allele should not be significantly smaller than it.
 528         # or there is no significant bias (0.5) in top2 allele.
 529
 530         #print a, b, c, d
 531         p1_l = binomial_cdf( a, (a+c), 0.5, lower=True )      # alternative: less than 0.5
 532         p1_r = binomial_cdf( c, (a+c), 0.5, lower=True )   #              greater than 0.5
 533         p2_l = binomial_cdf( b, (b+d), 0.5, lower=True )      # alternative: less than 0.5
 534         p2_r = binomial_cdf( d, (b+d), 0.5, lower=True )   #              greater than 0.5
 535         #print p1_l, p1_r, p2_l, p2_r
 536
 537         if (p1_l < 0.05 and p2_r < 0.05) or (p1_r < 0.05 and p2_l < 0.05):
 538             # we reject loci where the significant biases are inconsistent between top1 and top2 alleles.
 539             return 1.0
 540         else:
 541             # can't decide
 542             return 0.0
 543
 544     cpdef str to_vcf ( self ):
 545         """Output REF,ALT,QUAL,FILTER,INFO,FORMAT, SAMPLE columns.
 546         """
 547         cdef:
 548             str vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info, vcf_format, vcf_sample
 549
 550         vcf_ref = self.ref_allele.decode()
 551         vcf_alt = self.alt_allele.decode()
 552         vcf_qual = "%d" % self.GQ
 553         vcf_filter = "."
 554         vcf_info = (b"M=%s;MT=%s;DPT=%d;DPC=%d;DP1T=%d%s;DP2T=%d%s;DP1C=%d%s;DP2C=%d%s;SB=%d,%d,%d,%d;DBIC=%.2f;BICHOMOMAJOR=%.2f;BICHOMOMINOR=%.2f;BICHETERNOAS=%.2f;BICHETERAS=%.2f;AR=%.2f" % \
 555             (self.type.encode(), self.mutation_type.encode(), sum( self.n_reads_T.values() ), sum( self.n_reads_C.values() ),
 556              self.n_reads_T[self.top1allele], self.top1allele, self.n_reads_T[self.top2allele], self.top2allele,
 557              self.n_reads_C[self.top1allele], self.top1allele, self.n_reads_C[self.top2allele], self.top2allele,
 558              self.n_strand[ 0 ][ self.top1allele ], self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top1allele ], self.n_strand[ 1 ][ self.top2allele ],
 559              self.deltaBIC,
 560              self.BIC_homo_major, self.BIC_homo_minor, self.BIC_heter_noAS,self.BIC_heter_AS,
 561              self.n_reads_T[self.top1allele]/(self.n_reads_T[self.top1allele]+self.n_reads_T[self.top2allele])
 562              )).decode()
 563         vcf_format = "GT:DP:GQ:PL"
 564         vcf_sample = "%s:%d:%d:%d,%d,%d" % (self.GT, self.raw_read_depth( opt = "all" ), self.GQ, self.PL_00, self.PL_01, self.PL_11)
 565         return "\t".join( ( vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info, vcf_format, vcf_sample ) )
 566
 567     cpdef toVariant ( self ):
 568         cdef:
 569             object v
 570         v = Variant(
 571                      self.ref_allele.decode(),
 572                      self.alt_allele.decode(),
 573                      self.GQ,
 574                      '.',
 575                      self.type,
 576                      self.mutation_type,
 577                      self.top1allele.decode(),
 578                      self.top2allele.decode(),
 579                      sum( self.n_reads_T.values() ),
 580                      sum( self.n_reads_C.values() ),
 581                      self.n_reads_T[self.top1allele],
 582                      self.n_reads_T[self.top2allele],
 583                      self.n_reads_C[self.top1allele],
 584                      self.n_reads_C[self.top2allele],
 585                      self.n_strand[ 0 ][ self.top1allele ],
 586                      self.n_strand[ 0 ][ self.top2allele ],
 587                      self.n_strand[ 1 ][ self.top1allele ],
 588                      self.n_strand[ 1 ][ self.top2allele ],
 589                      self.deltaBIC,
 590                      self.BIC_homo_major,
 591                      self.BIC_homo_minor,
 592                      self.BIC_heter_noAS,
 593                      self.BIC_heter_AS,
 594                      self.n_reads_T[self.top1allele]/(self.n_reads_T[self.top1allele]+self.n_reads_T[self.top2allele]),
 595                      self.GT,
 596                      self.raw_read_depth( opt = "all" ),
 597                      self.PL_00,
 598                      self.PL_01,
 599                      self.PL_11 )
 600         return v