# cython: language_level=3
# cython: profile=True
# Time-stamp: <2020-12-04 18:41:28 Tao Liu>
5 """Module for SAPPER BAMParser class
7 Copyright (c) 2017 Tao Liu <tliu4@buffalo.edu>
9 This code is free software; you can redistribute it and/or modify it
10 under the terms of the BSD License (see the file COPYING included
11 with the distribution).
13 @status: experimental
14 @version: $Revision$
15 @author: Tao Liu
16 @contact: tliu4@buffalo.edu
17 """

# ------------------------------------
# python modules
# ------------------------------------
from cpython cimport bool

cimport cython

import numpy as np
cimport numpy as np

ctypedef np.float32_t float32_t
ctypedef np.int32_t int32_t

#from libc.math cimport log10, log, exp, M_LN10 #,fabs,log1p
#from libc.math cimport M_LN10
from math import log1p, exp, log

LN10 = 2.3025850929940458
LN10_tenth = 0.23025850929940458
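
# LN10 is ln(10) and LN10_tenth is ln(10)/10, used to turn Phred-scaled base
# qualities into error rates without log10/pow calls.  A minimal worked
# example of the conversion (plain Python, illustrative values only):
#
#   from math import exp
#   phred = 30                                     # a "Q30" base call
#   error = exp( -phred * 0.23025850929940458 )    # = 10**(-30/10) = 0.001
#   p_correct = 1 - error                          # = 0.999
#
# i.e. a Q30 base is assumed to be read correctly with probability ~0.999.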

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
cpdef tuple CalModel_Homo( np.ndarray[int32_t, ndim=1] top1_bq_T, np.ndarray[int32_t, ndim=1] top1_bq_C, np.ndarray[int32_t, ndim=1] top2_bq_T, np.ndarray[int32_t, ndim=1] top2_bq_C):
    """Return (lnL, BIC) for the homozygous model.
    """
    cdef:
        int i
        double lnL, BIC

    lnL = 0
    # Phred score is Phred = -10*log_{10} E, where E is the error rate.
    # So E = exp( -Phred/10 * ln10 ) = exp( -Phred * LN10_tenth ),
    # and 1 - E = 1 - exp( -Phred * LN10_tenth ).
    for i in range( top1_bq_T.shape[0] ):
        lnL += log1p( -exp(-top1_bq_T[ i ]*LN10_tenth) )
    for i in range( top1_bq_C.shape[0] ):
        lnL += log1p( -exp(-top1_bq_C[ i ]*LN10_tenth) )

    for i in range( top2_bq_T.shape[0] ):
        lnL += log( exp(-top2_bq_T[ i ]*LN10_tenth) )
    for i in range( top2_bq_C.shape[0] ):
        lnL += log( exp(-top2_bq_C[ i ]*LN10_tenth) )

    BIC = -2*lnL                # no free variable, no penalty
    return (lnL, BIC)
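
# A hedged usage sketch for CalModel_Homo (array names below are illustrative,
# not part of this module).  The four arrays hold Phred base qualities of reads
# supporting the top1/top2 alleles in treatment (T) and control (C), as int32:
#
#   import numpy as np
#   top1_T = np.array([30]*8, dtype=np.int32)   # 8 treatment reads agree at Q30
#   top2_T = np.array([20],   dtype=np.int32)   # 1 treatment read disagrees at Q20
#   top1_C = np.array([],     dtype=np.int32)   # no control reads
#   top2_C = np.array([],     dtype=np.int32)
#   lnL_homo, BIC_homo = CalModel_Homo(top1_T, top1_C, top2_T, top2_C)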

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
cpdef tuple CalModel_Heter_noAS( np.ndarray[int32_t, ndim=1] top1_bq_T, np.ndarray[int32_t, ndim=1] top1_bq_C, np.ndarray[int32_t, ndim=1] top2_bq_T, np.ndarray[int32_t, ndim=1] top2_bq_C ):
    """Return (lnL, BIC) for the heterozygous model without allele-specific bias.

    k_T and k_C, the estimated numbers of reads carrying the top1 allele in
    treatment and control, are optimized internally and not returned.
    """
    cdef:
        int k_T, k_C
        double lnL, BIC
        int i
        int tn_T, tn_C, tn  # total observed NTs
        double lnL_T, lnL_C # log likelihood for treatment and control

    lnL = 0
    BIC = 0

    # for k_T
    # total observed treatment reads from top1 and top2 NTs
    tn_T = top1_bq_T.shape[0] + top2_bq_T.shape[0]

    if tn_T == 0:
        raise Exception("Total number of treatment reads is 0!")
    else:
        ( lnL_T, k_T ) = GreedyMaxFunctionNoAS( top1_bq_T.shape[0], top2_bq_T.shape[0], tn_T, top1_bq_T, top2_bq_T )
        lnL += lnL_T
        BIC += -2*lnL_T

    # for k_C
    tn_C = top1_bq_C.shape[0] + top2_bq_C.shape[0]

    if tn_C == 0:
        pass
    else:
        ( lnL_C, k_C ) = GreedyMaxFunctionNoAS( top1_bq_C.shape[0], top2_bq_C.shape[0], tn_C, top1_bq_C, top2_bq_C )
        lnL += lnL_C
        BIC += -2*lnL_C

    tn = tn_C + tn_T

    # we penalize the bigger model depending on the number of reads/samples
    if tn_T == 0:
        BIC += log( tn_C )
    elif tn_C == 0:
        BIC += log( tn_T )
    else:
        BIC += log( tn_T ) + log( tn_C )

    return ( lnL, BIC )
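
# A hedged sketch of comparing the homozygous and heterozygous (no
# allele-specific bias) models; variable names are illustrative.  The model
# with the smaller BIC is preferred, and the lnL values can be fed to
# calculate_GQ further below:
#
#   lnL_homo,  BIC_homo  = CalModel_Homo(      top1_T, top1_C, top2_T, top2_C)
#   lnL_heter, BIC_heter = CalModel_Heter_noAS(top1_T, top1_C, top2_T, top2_C)
#   if BIC_heter < BIC_homo:
#       pass   # the heterozygous model fits better despite its extra parameter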

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
cpdef tuple CalModel_Heter_AS( np.ndarray[int32_t, ndim=1] top1_bq_T, np.ndarray[int32_t, ndim=1] top1_bq_C, np.ndarray[int32_t, ndim=1] top2_bq_T, np.ndarray[int32_t, ndim=1] top2_bq_C, float max_allowed_ar = 0.99 ):
    """Return (lnL, BIC) for the heterozygous model allowing an allele-specific
    allele ratio in the treatment (capped by max_allowed_ar).

    AS_alleleratio, the estimated treatment allele ratio, is optimized
    internally and not returned.
    """
    cdef:
        int k_T, k_C
        double lnL, BIC
        int i
        int tn_T, tn_C, tn  # total observed NTs
        double lnL_T, lnL_C # log likelihood for treatment and control
        double AS_alleleratio # allele ratio

    lnL = 0
    BIC = 0

    #assert top2_bq_T.shape[0] + top2_bq_C.shape[0] > 0, "Total number of top2 nt should not be zero while using this function: CalModel_Heter_AS!"

    # treatment
    tn_T = top1_bq_T.shape[0] + top2_bq_T.shape[0]

    if tn_T == 0:
        raise Exception("Total number of treatment reads is 0!")
    else:
        ( lnL_T, k_T, AS_alleleratio ) = GreedyMaxFunctionAS( top1_bq_T.shape[0], top2_bq_T.shape[0], tn_T, top1_bq_T, top2_bq_T, max_allowed_ar )
        #print ">>>",lnL_T, k_T, AS_alleleratio
        lnL += lnL_T
        BIC += -2*lnL_T

    # control
    tn_C = top1_bq_C.shape[0] + top2_bq_C.shape[0]

    if tn_C == 0:
        pass
    else:
        # We assume control will not have allele preference
        ( lnL_C, k_C ) = GreedyMaxFunctionNoAS( top1_bq_C.shape[0], top2_bq_C.shape[0], tn_C, top1_bq_C, top2_bq_C )
        lnL += lnL_C
        BIC += -2*lnL_C

    tn = tn_C + tn_T

    # we penalize the bigger model depending on the number of reads/samples
    if tn_T == 0:
        BIC += log( tn_C )
    elif tn_C == 0:
        BIC += 2 * log( tn_T )
    else:
        BIC += 2 * log( tn_T ) + log( tn_C )

    return (lnL, BIC)
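
# A hedged usage sketch (illustrative wiring): the allele-specific model lets
# the treatment allele ratio float, capped at max_allowed_ar, while the control
# is still modeled without allele preference.  Comparing its lnL against the
# non-allele-specific heterozygous model can hint at an allele-specific event,
# e.g. via calculate_GQ_heterASsig at the end of this module:
#
#   lnL_as, BIC_as = CalModel_Heter_AS(top1_T, top1_C, top2_T, top2_C,
#                                      max_allowed_ar=0.95)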

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
cdef tuple GreedyMaxFunctionAS( int m, int n, int tn, np.ndarray[int32_t, ndim=1] me, np.ndarray[int32_t, ndim=1] ne, float max_allowed_ar = 0.99 ):
    """Return (lnL, k, alleleratio) as a tuple.

    Note: this is only a translation of Liqing's C++ code into pyx; it has not
    been reviewed.
    """
    cdef:
        double dnew, dold, rold, rnew
        int kold, knew
        bool btemp
        int k0
        double dl, dr, d0, d1l, d1r

    assert m + n == tn
    btemp = False
    if tn == 1:                 # only 1 read; I don't expect this to be run...
        dl = calculate_ln( m, n, tn, me, ne, 0, 0 )
        dr = calculate_ln( m, n, tn, me, ne, 1, 1 )

        if dl > dr:
            k = 0
            return ( dl, 0, 0 )
        else:
            k = 1
            return ( dr, 1, 1 )
    elif m == 0:                # no top1 nt
        return ( calculate_ln( m, n, tn, me, ne, 0, m, max_allowed_ar ), m, 1 - max_allowed_ar )
        #k0 = m + 1
    elif m == tn:               # all reads are top1
        return ( calculate_ln( m, n, tn, me, ne, 1, m, max_allowed_ar ), m, max_allowed_ar )
    else:
        k0 = m

    d0  = calculate_ln( m, n, tn, me, ne, float(k0)/tn,   k0,   max_allowed_ar )
    d1l = calculate_ln( m, n, tn, me, ne, float(k0-1)/tn, k0-1, max_allowed_ar )
    d1r = calculate_ln( m, n, tn, me, ne, float(k0+1)/tn, k0+1, max_allowed_ar )

    if d0 > d1l - 1e-8 and d0 > d1r - 1e-8:
        k = k0
        ar = float(k0)/tn
        return ( d0, k, ar )
    elif d1l > d0:
        dold = d1l
        kold = k0 - 1
        rold = float(k0 - 1)/tn
        while kold > 1:         # disable: when kold=1 still run, then knew=0 is the final run
            knew = kold - 1
            rnew = float(knew)/tn

            dnew = calculate_ln( m, n, tn, me, ne, rnew, knew, max_allowed_ar )

            if dnew - 1e-8 < dold:
                btemp = True
                break
            kold = knew
            dold = dnew
            rold = rnew

        if btemp:               # maximum L value is in [1, m-1]
            k = kold
            ar = rold
            return ( dold, k, ar )
        else:                   # L(k=0) is the max for [0, m-1]
            k = kold
            ar = rold
            return ( dold, k, ar )
    elif d1r > d0:
        dold = d1r
        kold = k0 + 1
        rold = float(k0 + 1)/tn
        while kold < tn - 1:    # disable: when kold=tn-1 still run, then knew=tn is the final run
            knew = kold + 1
            rnew = float(knew)/tn

            dnew = calculate_ln( m, n, tn, me, ne, rnew, knew, max_allowed_ar )

            if dnew - 1e-8 < dold:
                btemp = True
                break
            kold = knew
            dold = dnew
            rold = rnew

        if btemp:               # maximum L value is in [m+1, tn-1]
            k = kold
            ar = rold
            return ( dold, k, ar )
        else:                   # L(k=tn) is the max for [m+1, tn]
            k = kold
            ar = rold
            return ( dold, k, ar )
    else:
        raise Exception("error in GreedyMaxFunctionAS")
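
# GreedyMaxFunctionAS above (and GreedyMaxFunctionNoAS below) follow the same
# pattern: start at k0 = m, probe both neighbors, then walk in the improving
# direction until the likelihood stops increasing (within 1e-8).  A simplified
# pure-Python sketch of that search (boundary handling reduced; names are
# illustrative, not part of this module):
#
#   def greedy_max(f, k0, lo, hi, eps=1e-8):
#       d0, dl, dr = f(k0), f(k0 - 1), f(k0 + 1)
#       if d0 > dl - eps and d0 > dr - eps:
#           return d0, k0                      # k0 is already a local maximum
#       step = -1 if dl > d0 else 1
#       k, d = k0 + step, (dl if dl > d0 else dr)
#       while lo < k + step < hi:
#           if f(k + step) - eps < d:
#               break                          # next step does not improve
#           k, d = k + step, f(k + step)
#       return d, k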

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
cdef tuple GreedyMaxFunctionNoAS( int m, int n, int tn, np.ndarray[int32_t, ndim=1] me, np.ndarray[int32_t, ndim=1] ne ):
    """Return (lnL, k) as a tuple.

    Note: this is only a translation of Liqing's C++ code into pyx; it has not
    been reviewed.
    """
    cdef:
        double dnew, dold
        int kold, knew
        bool btemp
        int k0
        double bg_r, dl, dr, d0, d1l, d1r

    btemp = False
    bg_r = 0.5

    if tn == 1:
        dl = calculate_ln( m, n, tn, me, ne, bg_r, 0 )
        dr = calculate_ln( m, n, tn, me, ne, bg_r, 1 )
        if dl > dr:
            k = 0
            return ( dl, 0 )
        else:
            k = 1
            return ( dr, 1 )
    elif m == 0:                # no top1 nt
        return ( calculate_ln( m, n, tn, me, ne, bg_r, m ), m )
        #k0 = m + 1
    elif m == tn:               # all reads are top1
        return ( calculate_ln( m, n, tn, me, ne, bg_r, m ), m )
    #elif m == 0:
    #    k0 = m + 1
    #elif m == tn:
    #    k0 = m - 1
    else:
        k0 = m

    d0  = calculate_ln( m, n, tn, me, ne, bg_r, k0 )
    d1l = calculate_ln( m, n, tn, me, ne, bg_r, k0 - 1 )
    d1r = calculate_ln( m, n, tn, me, ne, bg_r, k0 + 1 )

    if d0 > d1l - 1e-8 and d0 > d1r - 1e-8:
        k = k0
        return ( d0, k )
    elif d1l > d0:
        dold = d1l
        kold = k0 - 1
        while kold >= 1:        # when kold=1 still run, then knew=0 is the final run
            knew = kold - 1
            dnew = calculate_ln( m, n, tn, me, ne, bg_r, knew )
            if dnew - 1e-8 < dold:
                btemp = True
                break
            kold = knew
            dold = dnew

        if btemp:               # maximum L value is in [1, m-1]
            k = kold
            return ( dold, k )
        else:                   # L(k=0) is the max for [0, m-1]
            k = kold
            return ( dold, k )
    elif d1r > d0:
        dold = d1r
        kold = k0 + 1
        while kold <= tn - 1:   # when kold=tn-1 still run, then knew=tn is the final run
            knew = kold + 1
            dnew = calculate_ln( m, n, tn, me, ne, bg_r, knew )
            if dnew - 1e-8 < dold:
                btemp = True
                break
            kold = knew
            dold = dnew

        if btemp:               # maximum L value is in [m+1, tn-1]
            k = kold
            return ( dold, k )
        else:                   # L(k=tn) is the max for [m+1, tn]
            k = kold
            return ( dold, k )
    else:
        raise Exception("error in GreedyMaxFunctionNoAS")

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
cdef calculate_ln( int m, int n, int tn, np.ndarray[int32_t, ndim=1] me, np.ndarray[int32_t, ndim=1] ne, double r, int k, float max_allowed_r = 0.99 ):
    """Calculate the log likelihood given the qualities of top1 and top2 reads,
    the ratio r and the observed k.
    """
    cdef:
        int i
        double lnL
        double e

    lnL = 0

    if r > max_allowed_r or r < 1 - max_allowed_r: # r is extremely high or low
        #print "here1"
        lnL += k*log( max_allowed_r ) + (tn-k)*log( 1 - max_allowed_r ) #-10000
    else:
        lnL += k*log( r ) + (tn-k)*log( 1 - r )

    if k == 0 or k == tn:       # it's entirely biased toward 1 allele
        #print "here2"
        pass
        #lnL += k*log( max_allowed_r ) #-10000
        #lnL += -10000
    elif k <= tn/2:
        for i in range( k ):
            lnL += log( float(tn-i)/(k-i) )
    else:
        for i in range( tn-k ):
            lnL += log( float(tn-i)/(tn-k-i) )

    for i in range( m ):
        e = exp( - me[ i ] * LN10_tenth )
        lnL += log( (1-e)*(float(k)/tn) + e*(1-float(k)/tn) )

    for i in range( n ):
        e = exp( - ne[ i ] * LN10_tenth )
        lnL += log( (1-e)*(1-float(k)/tn) + e*(float(k)/tn) )

    #print r,k,lnL
    return lnL
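
# What calculate_ln adds up, written out (reconstructed from the code above;
# the k*ln(r) term falls back to max_allowed_r when r lies outside
# [1-max_allowed_r, max_allowed_r]):
#
#   lnL = ln C(tn, k)                                  # the two short loops
#       + k*ln(r) + (tn-k)*ln(1-r)                     # observing k of tn reads
#       + sum_i ln( (1-e_i)*(k/tn) + e_i*(1-k/tn) )    # over top1 qualities me
#       + sum_j ln( (1-e_j)*(1-k/tn) + e_j*(k/tn) )    # over top2 qualities ne
#
# where e = exp(-Phred*LN10_tenth) is the per-base error rate; ln C(tn, k) is
# skipped (it equals 0) when k == 0 or k == tn.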

cpdef int calculate_GQ ( double lnL1, double lnL2, double lnL3 ):
    """GQ1 = -10*log_{10}((L2+L3)/(L1+L2+L3))
    """
    cdef:
        double L1, L2, L3, sum, tmp
        int GQ_score

    #L1 = exp(lnL1-lnL1)
    L1 = 1
    L2 = exp( lnL2 - lnL1 )
    L3 = exp( lnL3 - lnL1 )

    #if L1 > 1:
    #    L1 = 1
    if L2 > 1:
        L2 = 1
    if L3 > 1:
        L3 = 1
    #if(L1<1e-110) L1=1e-110;
    if L2 < 1e-110:
        L2 = 1e-110
    if L3 < 1e-110:
        L3 = 1e-110

    sum = L1 + L2 + L3
    tmp = ( L2 + L3 )/sum
    if tmp > 1e-110:
        GQ_score = int(-4.34294*log(tmp))
    else:
        GQ_score = 255

    return GQ_score
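
# Note on the constant: -4.34294 is approximately -10/ln(10), so the score is
# int(-10*log10(tmp)), capped at 255 when tmp underflows.  A small worked
# example with made-up log likelihoods (best model first, two alternatives):
#
#   gq = calculate_GQ(-40.0, -55.0, -60.0)
#   # L2+L3 = exp(-15)+exp(-20) ~= 3.1e-7, sum ~= 1.0000003
#   # tmp ~= 3.1e-7  ->  GQ_score = int(-10*log10(3.1e-7)) = 65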

cpdef int calculate_GQ_heterASsig( double lnL1, double lnL2 ):
    cdef:
        double L1, L2, sum, tmp
        int ASsig_score

    #L1=exp(2.7182818,lnL1-lnL1)
    L1 = 1
    L2 = exp( lnL2 - lnL1 )

    #if L1 > 1:
    #    L1 = 1
    if L2 > 1:
        L2 = 1
    #if L1 < 1e-110:
    #    L1 = 1e-110
    if L2 < 1e-110:
        L2 = 1e-110

    sum = L1 + L2
    tmp = L2/sum
    if tmp > 1e-110:
        ASsig_score = int(-4.34294*log(tmp))
    else:
        ASsig_score = 255

    return ASsig_score
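
# A hedged end-to-end sketch of how these pieces could be wired together (the
# actual SAPPER caller may differ; names other than the functions are
# illustrative):
#
#   lnL_homo,  _ = CalModel_Homo(      top1_T, top1_C, top2_T, top2_C)
#   lnL_heter, _ = CalModel_Heter_noAS(top1_T, top1_C, top2_T, top2_C)
#   lnL_as,    _ = CalModel_Heter_AS(  top1_T, top1_C, top2_T, top2_C)
#   gq    = calculate_GQ(lnL_homo, lnL_heter, lnL_as)    # genotype quality
#   assig = calculate_GQ_heterASsig(lnL_heter, lnL_as)   # allele-specific significance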