MACS3/Signal/ReadAlignment.pyx

   1 # cython: language_level=3
   2 # cython: profile=True
   3 # Time-stamp: <2021-03-10 16:21:51 Tao Liu>
   4
   5 """Module for SAPPER ReadAlignment class
   6
   7 This code is free software; you can redistribute it and/or modify it
   8 under the terms of the BSD License (see the file COPYING included
   9 with the distribution).
  10 """
  11
  12 # ------------------------------------
  13 # python modules
  14 # ------------------------------------
  15 from cpython cimport bool
  16
  17 cdef extern from "stdlib.h":
  18     ctypedef unsigned int size_t
  19     size_t strlen(char *s)
  20     void *malloc(size_t size)
  21     void *calloc(size_t n, size_t size)
  22     void free(void *ptr)
  23     int strcmp(char *a, char *b)
  24     char * strcpy(char *a, char *b)
  25     long atol(char *bytes)
  26     int atoi(char *bytes)
  27
  28 # ------------------------------------
  29 # constants
  30 # ------------------------------------
  31
  32 __BAMDNACODE__ = b"=ACMGRSVTWYHKDBN"
  33 __CIGARCODE__ = "MIDNSHP=X"
  34 __DNACOMPLEMENT__ = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@TBGDEFCHIJKLMNOPQRSAUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' # A trans table to convert A to T, C to G, G to C, and T to A.
  35
  36 # -- CIGAR CODE --
  37 #OP BAM  Description
  38 #M  0    alignment match (can be a sequence match or mismatch) insertion to the reference
  39 #I  1    insertion to the reference
  40 #D  2    deletion from the reference
  41 #N  3    skipped region from the reference
  42 #S  4    soft clipping (clipped sequences present in SEQ)
  43 #H  5    hard clipping (clipped sequences NOT present in SEQ)
  44 #P  6    padding (silent deletion from padded reference)
  45 #=  7    sequence match
  46 #X  8    sequence mismatch
  47 # -- -- -- -- -- --
  48
  49 # ------------------------------------
  50 # Misc functions
  51 # ------------------------------------
  52
  53 # ------------------------------------
  54 # Classes
  55 # ------------------------------------
  56 cdef class ReadAlignment:
  57     cdef:
  58         bytes readname
  59         bytes chrom
  60         int lpos
  61         int rpos
  62         int strand              # strand information. 0 means forward strand, 1 means reverse strand.
  63         bytes binaryseq
  64         bytes binaryqual
  65         int l                   # length of read
  66         tuple cigar             # each item contains op_l|op
  67         bytes MD
  68         int n_edits             # number of edits; higher the number,
  69                                 # more differences with reference
  70         bytes SEQ               # sequence of read regarding to + strand
  71         bytes QUAL              # quality of read regarding to + strand
  72
  73     def __init__ ( self,
  74                    bytes readname,
  75                    bytes chrom, int lpos, int rpos,
  76                    int strand,
  77                    bytes binaryseq,
  78                    bytes binaryqual,
  79                    tuple cigar,
  80                    bytes MD ):
  81         self.readname = readname
  82         self.chrom = chrom
  83         self.lpos = lpos
  84         self.rpos = rpos
  85         self.strand = strand
  86         self.binaryseq = binaryseq
  87         self.binaryqual = binaryqual
  88         self.l = len( binaryqual )
  89         self.cigar = cigar
  90         self.MD = MD
  91         self.n_edits = self.get_n_edits()
  92         (self.SEQ, self.QUAL) = self.__get_SEQ_QUAL()
  93
  94     cdef int get_n_edits( self ):
  95         """The number is from self.cigar and self.MD.
  96
  97         """
  98         cdef:
  99             int n_edits
 100             int i, cigar_op, cigar_op_l
 101             char c
 102
 103         n_edits = 0
 104         for i in self.cigar:    # only count insertion or softclip
 105             cigar_op = i & 15
 106             cigar_op_l = i >> 4
 107             if cigar_op in [ 1, 4 ]:    # count Insertion or Softclip
 108                 n_edits += cigar_op_l
 109
 110         for c in self.MD:
 111             if (c > 64 and c < 91): # either deletion in query or mismatch
 112                 n_edits += 1
 113         return n_edits
 114
 115     def __str__ ( self ):
 116         c = self.chrom.decode()
 117         n = self.readname.decode()
 118         if self.strand:
 119             s = "-"
 120         else:
 121             s = "+"
 122         return f"{c}\t{self.lpos}\t{self.rpos}\t{n}\t{self.l}\t{s}"
 123
 124     def __getitem__ ( self, keyname ):
 125         if keyname == "readname":
 126             return self.readname
 127         elif keyname == "chrom":
 128             return self.chrom
 129         elif keyname == "lpos":
 130             return self.lpos
 131         elif keyname == "rpos":
 132             return self.rpos
 133         elif keyname == "strand":
 134             return self.strand
 135         elif keyname == "SEQ":
 136             return self.SEQ
 137         elif keyname == "QUAL":
 138             return self.QUAL
 139         elif keyname == "n_edits":
 140             return self.n_edits
 141         elif keyname == "binaryseq":
 142             return self.binaryseq
 143         elif keyname == "binaryqual":
 144             return self.binaryqual
 145         elif keyname == "l":
 146             return self.l
 147         elif keyname == "cigar":
 148             return self.cigar
 149         elif keyname == "MD":
 150             return self.MD
 151         else:
 152             raise KeyError("No such key", keyname)
 153
 154     def __getstate__ ( self ):
 155         return ( self.readname, self.chrom, self.lpos, self.rpos, self.strand, self.binaryseq, self.binaryqual, self.l, self.cigar, self.MD, self.n_edits, self.SEQ, self.QUAL )
 156
 157     def __setstate__ ( self, state ):
 158         ( self.readname, self.chrom, self.lpos, self.rpos, self.strand, self.binaryseq, self.binaryqual, self.l, self.cigar, self.MD, self.n_edits, self.SEQ, self.QUAL ) = state
 159
 160     # cpdef bytearray get_SEQ ( self ):
 161     #     """Convert binary seq to ascii seq.
 162
 163     #     Rule: for each byte, 1st base in the highest 4bit; 2nd in the lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15]
 164
 165     #     Note: In BAM, if a sequence is mapped to reverse strand, the
 166     #     reverse complement seq is written in SEQ field. So the return
 167     #     value of this function will not be the original one if the
 168     #     read is mapped to - strand.
 169     #     """
 170     #     cdef:
 171     #         char c
 172     #         bytearray seq
 173
 174     #     seq = bytearray(b"")
 175     #     for c in self.binaryseq:
 176     #         # high
 177     #         seq.append( __BAMDNACODE__[c >> 4 & 15] )
 178     #         # low
 179     #         seq.append( __BAMDNACODE__[c & 15] )
 180     #     if seq[-1] == b"=":
 181     #         # trim the last '=' if it exists
 182     #         seq = seq[:-1]
 183     #     return seq
 184
 185     cdef tuple __get_SEQ_QUAL ( self ):
 186         """Get tuple of (SEQ, QUAL).
 187
 188         Rule: for each byte, 1st base in the highest 4bit; 2nd in the lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15]
 189
 190         Note: In BAM, if a sequence is mapped to reverse strand, the
 191         reverse complement seq is written in SEQ field. So the return
 192         value of this function will not be the original one if the
 193         read is mapped to - strand. If you need to original one, do
 194         reversecomp for SEQ and reverse QUAL.
 195         """
 196         cdef:
 197             int i
 198             char c
 199             bytearray seq
 200             bytearray qual
 201
 202         seq = bytearray(b"")
 203         qual = bytearray(b"")
 204
 205         for i in range( len(self.binaryseq) ):
 206             c = self.binaryseq[ i ]
 207             # high
 208             seq.append( __BAMDNACODE__[c >> 4 & 15] )
 209             # low
 210             seq.append( __BAMDNACODE__[c & 15] )
 211
 212         for i in range( len( self.binaryqual ) ):
 213             # qual is the -10log10 p or phred score.
 214             qual.append( self.binaryqual[i] )
 215
 216         if seq[-1] == b"=":
 217             # trim the last '=' if it exists
 218             seq = seq[:-1]
 219         assert len( seq ) == len( qual ), Exception("Lengths of seq and qual are not consistent!")
 220
 221         # Example on how to get original SEQ and QUAL:
 222         #if self.strand:
 223         #    seq.reverse()
 224         #    #compliment
 225         #    seq = seq.translate( __DNACOMPLEMENT__ )
 226         #    qual.reverse()
 227
 228         return ( bytes(seq), bytes(qual) )
 229
 230
 231     cpdef bytes get_FASTQ ( self ):
 232         """Get FASTQ format text.
 233
 234         """
 235         cdef:
 236             bytes seq
 237             bytearray qual
 238
 239         seq = self.SEQ
 240         qual = bytearray(self.QUAL)
 241
 242         for i in range( len( self.QUAL ) ):
 243             # qual is the -10log10 p or phred score, to make FASTQ, we have to add 33
 244             qual[ i ] += 33
 245
 246         # reverse while necessary
 247         if self.strand:
 248             seq = self.SEQ[::-1]
 249             #compliment
 250             seq = seq.translate( __DNACOMPLEMENT__ )
 251             qual = qual[::-1]
 252         else:
 253             seq = self.SEQ
 254
 255         return b"@" + self.readname + b"\n" + seq + b"\n+\n" + qual + b"\n"
 256
 257     cpdef bytearray get_REFSEQ ( self ):
 258         """Fetch reference sequence, using self.MD and self.cigar
 259         """
 260         cdef:
 261             char c
 262             bytearray seq, refseq
 263             int i, cigar_op, cigar_op_l
 264             bytearray MD_op
 265             int ind
 266             bool flag_del       # flag for deletion event in query
 267
 268         seq = bytearray(self.SEQ)    # we start with read seq then make modifications
 269
 270         # 2-step proces
 271         # First step: use CIGAR to edit SEQ to remove S (softclip) and I (insert)
 272         # __CIGARCODE__ = "MIDNSHP=X"
 273         # let ind be the index in SEQ
 274         ind = 0
 275         for i in self.cigar:
 276             cigar_op = i & 15
 277             cigar_op_l = i >> 4
 278             if cigar_op in [2, 5, 6]:     # do nothing for Deletion (we will
 279                                           # put sequence back in step 2),
 280                                           # Hardclip and Padding
 281                 pass
 282             elif cigar_op in [0, 7, 8]:   # M = X alignment match (match or
 283                                           # mismatch)
 284                 # do nothing and move ind
 285                 ind += cigar_op_l
 286             elif cigar_op in [ 1, 4 ]:    # Remove for Insertion or Softclip
 287                 seq[ ind : ind + cigar_op_l ] = b''
 288
 289         # now the seq should be at the same length as rpos-lpos
 290
 291         # Second step: use MD string to edit SEQ to put back 'deleted
 292         # seqs' and modify mismatches
 293
 294         # let ind be the index in SEQ again, from 0
 295         ind = 0
 296         MD_op = bytearray(b'')
 297         flag_del = False
 298         for c in self.MD:
 299             if c < 58 and c > 47:
 300                 # means Match
 301                 flag_del = False
 302                 MD_op.append(c)
 303             elif (c > 64 and c < 91) and not flag_del:
 304                 # An alphabet means Mismatch, Note, if MD is made
 305                 # right, a mismatch should only be 1 letter surrounded
 306                 # by digits.
 307                 ind += int(MD_op)
 308                 seq[ ind ] = c
 309                 ind += 1
 310                 # reset MD_op
 311                 MD_op = bytearray(b'')
 312             elif (c > 64 and c < 91) and flag_del:
 313                 seq[ ind:ind ] = [c,]
 314                 ind += 1
 315             elif c == 94:
 316                 # means Deletion in query. Now, insert a sequnce into
 317                 # SEQ
 318                 flag_del = True
 319                 ind += int(MD_op)
 320                 # reset MD_op
 321                 MD_op = bytearray(b'')
 322             else:
 323                 raise Exception("Don't understand this operator in MD: %c" % c)
 324             #print( seq.decode() )
 325
 326         return seq
 327
 328     cpdef get_base_by_ref_pos ( self, long ref_pos ):
 329         """Get base by ref position.
 330
 331         """
 332         cdef:
 333            int relative_pos, p
 334         assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location")
 335         relative_pos = ref_pos - self.lpos
 336         p = self.relative_ref_pos_to_relative_query_pos( relative_pos )
 337
 338         if p == -1:             # located in a region deleted in query
 339             return None
 340         else:
 341             return __BAMDNACODE__[ (self.binaryseq[p//2] >> ((1-p%2)*4) ) & 15 ]
 342
 343     cpdef get_bq_by_ref_pos ( self, long ref_pos ):
 344         """Get base quality by ref position. Base quality is in Phred scale.
 345
 346         Returned value is the raw Phred-scaled base quality.
 347
 348         """
 349         cdef:
 350            int relative_pos, p
 351         assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location")
 352         relative_pos = ref_pos - self.lpos
 353         p = self.relative_ref_pos_to_relative_query_pos( relative_pos )
 354
 355         if p == -1:             # located in a region deleted in query
 356             return None
 357         else:
 358             return self.binaryqual[p]
 359
 360     cpdef tuple get_base_bq_by_ref_pos ( self, long ref_pos ):
 361         """Get base and base quality by ref position. Base quality is in Phred scale.
 362
 363         Returned bq is the raw Phred-scaled base quality.
 364         """
 365         cdef:
 366            int relative_pos, p
 367         assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location")
 368         relative_pos = ref_pos - self.lpos
 369         p = self.relative_ref_pos_to_relative_query_pos( relative_pos )
 370
 371         if p == -1:             # located in a region deleted in query
 372             return None
 373         else:
 374             return ( __BAMDNACODE__[ (self.binaryseq[p//2] >> ((1-p%2)*4) ) & 15 ], self.binaryqual[p] )
 375
 376     cpdef tuple get_variant_bq_by_ref_pos ( self, long ref_pos ):
 377         """Get any variants (different with reference) and base quality by ref position.
 378
 379         variants will be
 380
 381         1) =, if identical
 382
 383         2) A/T/C/G, if SNV
 384
 385         3) -, if the reference base is deleted, in this case, bq will
 386         be the highest possible bq, which is 93.
 387
 388         4) ^<A/T/C/G>+, if there is an insertion at the location
 389
 390         Base quality is the raw Phred-scaled base quality.
 391
 392         """
 393         cdef:
 394            int i, m, n
 395            int res, p, op, op_l
 396            int pos
 397            bool tip
 398            bytearray refseq
 399            bytes p_refseq, p_seq
 400            bytearray seq_array
 401            bytearray bq_array
 402
 403         assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location")
 404
 405         res = ref_pos - self.lpos          # residue
 406         p = 0
 407
 408         refseq = self.get_REFSEQ()
 409         p_refseq =  refseq[ res ]
 410         # -- CIGAR CODE --
 411         #OP BAM  Description
 412         #M  0    alignment match (can be a sequence match or mismatch) insertion to the reference
 413         #I  1    insertion to the reference
 414         #D  2    deletion from the reference
 415         #N  3    skipped region from the reference
 416         #S  4    soft clipping (clipped sequences present in SEQ)
 417         #H  5    hard clipping (clipped sequences NOT present in SEQ)
 418         #P  6    padding (silent deletion from padded reference)
 419         #=  7    sequence match
 420         #X  8    sequence mismatch
 421
 422         seq_array = bytearray( b'' )
 423         bq_array = bytearray( b'' )
 424
 425         for m in range( len(self.cigar) ):
 426             i = self.cigar[ m ]
 427             op = i & 15
 428             op_l = i >> 4
 429             if op in [0, 7, 8]:         # M = X alignment match (match or mismatch)
 430                 if res < op_l - 1:
 431                     # in the range of a CIGAR operator
 432                     p += res
 433                     # find the position, now get the ref
 434                     pos = p
 435                     seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] )
 436                     bq_array.append( self.binaryqual[ p ] )
 437                     break
 438                 elif res == op_l - 1:
 439                     p += res
 440                     pos = p
 441                     seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] )
 442                     bq_array.append( self.binaryqual[ p ] )
 443                     # now add any insertion later on
 444                     # get next cigar
 445                     if m + 1 == len( self.cigar ):
 446                         break
 447                     i = self.cigar[ m + 1 ]
 448                     op = i & 15
 449                     op_l = i >> 4
 450                     if op == 1:           #insertion
 451                         for n in range( op_l ):
 452                             p += 1
 453                             seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] )
 454                             bq_array.append( self.binaryqual[ p ] )
 455                         #print self.SEQ, seq_array
 456                     break
 457                 else:
 458                     # go to the next cigar code
 459                     p += op_l
 460                     res -= op_l
 461             elif op in [ 2, 3 ]: # D N
 462                 if res < op_l:
 463                     # find the position, however ...
 464                     # position located in a region in reference that not exists in query
 465                     pos = p
 466                     seq_array.append( b'*' )
 467                     bq_array.append( 93 )   #assign 93 for deletion
 468                     break
 469                 else:
 470                     # go to the next cigar code
 471                     res -= op_l
 472             elif op == 1 :      # Insertion
 473                 p += op_l
 474                 # if res == 0:    # no residue left, so return a chunk of inserted sequence
 475                 #     print "shouldn't run this code"
 476                 #     # first, add the insertion point
 477                 #     seq_array = bytearray( b'~' )
 478                 #     bq_array.append( self.binaryqual[ p ] )
 479                 #     # then add the inserted seq
 480                 #     for i in range( op_l ):
 481                 #         p += 1
 482                 #         seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ]  )
 483                 #         bq_array.append( self.binaryqual[ p ] )
 484                 #     break
 485                 # else:
 486                 #     p += op_l
 487             elif op == 4 :      # Softclip. If it's Softclip, we'd better not return the extra seq
 488                 p += op_l
 489
 490         if pos == 0 or pos == self.l - 1:
 491             tip = True
 492         else:
 493             tip = False
 494
 495         return ( seq_array, bq_array, self.strand, tip, pos )
 496         # last position ?
 497         #raise Exception("Not expected to see this")
 498
 499     cdef int relative_ref_pos_to_relative_query_pos ( self, long relative_ref_pos ):
 500         """Convert relative pos on ref to pos on query.
 501         """
 502         cdef:
 503             int p, res, op, op_l
 504         p = 0
 505         res = relative_ref_pos
 506
 507         for i in self.cigar:
 508             op = i & 15
 509             op_l = i >> 4
 510             if op in [0, 7, 8]:         # M = X alignment match (match or mismatch)
 511                 if res < op_l:
 512                     p += res
 513                     return p
 514                 else:
 515                     p += op_l
 516                     res -= op_l
 517             elif op in [ 2, 3 ]: # D N
 518                 if res < op_l:
 519                     # position located in a region in reference that not exists in query
 520                     return -1
 521                 else:
 522                     res -= op_l
 523             elif op in [ 1, 4 ]:       # I
 524                 p += op_l
 525         return p
 526
 527
 528 ### End ###
 529