1 # cython: language_level=3
3 # Time-stamp: <2021-03-10 16:21:51 Tao Liu>
5 """Module for SAPPER ReadAlignment class
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file COPYING included
9 with the distribution).
12 # ------------------------------------
14 # ------------------------------------
15 from cpython cimport bool
17 cdef extern from "stdlib.h":
18 ctypedef unsigned
int size_t
19 size_t strlen
(char *s
)
20 void *malloc
(size_t size
)
21 void *calloc
(size_t n
, size_t size
)
23 int strcmp
(char *a
, char *b
)
24 char * strcpy
(char *a
, char *b
)
25 long atol
(char *bytes
)
28 # ------------------------------------
30 # ------------------------------------
32 __BAMDNACODE__
= b
"=ACMGRSVTWYHKDBN"
33 __CIGARCODE__
= "MIDNSHP=X"
34 __DNACOMPLEMENT__
= b
'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@TBGDEFCHIJKLMNOPQRSAUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' # A trans table to convert A to T, C to G, G to C, and T to A.
38 #M 0 alignment match (can be a sequence match or mismatch) insertion to the reference
39 #I 1 insertion to the reference
40 #D 2 deletion from the reference
41 #N 3 skipped region from the reference
42 #S 4 soft clipping (clipped sequences present in SEQ)
43 #H 5 hard clipping (clipped sequences NOT present in SEQ)
44 #P 6 padding (silent deletion from padded reference)
46 #X 8 sequence mismatch
49 # ------------------------------------
51 # ------------------------------------
53 # ------------------------------------
55 # ------------------------------------
56 cdef class ReadAlignment
:
62 int strand
# strand information. 0 means forward strand, 1 means reverse strand.
65 int l
# length of read
66 tuple cigar
# each item contains op_l|op
68 int n_edits
# number of edits; higher the number,
69 # more differences with reference
70 bytes SEQ
# sequence of read regarding to + strand
71 bytes QUAL
# quality of read regarding to + strand
75 bytes chrom
, int lpos
, int rpos
,
81 self.readname
= readname
86 self.binaryseq
= binaryseq
87 self.binaryqual
= binaryqual
88 self.l
= len( binaryqual
)
91 self.n_edits
= self.get_n_edits
()
92 (self.SEQ
, self.QUAL
) = self.__get_SEQ_QUAL
()
94 cdef int get_n_edits
( self ):
95 """The number is from self.cigar and self.MD.
100 int i
, cigar_op
, cigar_op_l
104 for i
in self.cigar
: # only count insertion or softclip
107 if cigar_op
in [ 1, 4 ]: # count Insertion or Softclip
108 n_edits
+= cigar_op_l
111 if (c
> 64 and c
< 91): # either deletion in query or mismatch
115 def __str__
( self ):
116 c
= self.chrom
.decode
()
117 n
= self.readname
.decode
()
122 return f
"{c}\t{self.lpos}\t{self.rpos}\t{n}\t{self.l}\t{s}"
124 def __getitem__
( self, keyname
):
125 if keyname
== "readname":
127 elif keyname
== "chrom":
129 elif keyname
== "lpos":
131 elif keyname
== "rpos":
133 elif keyname
== "strand":
135 elif keyname
== "SEQ":
137 elif keyname
== "QUAL":
139 elif keyname
== "n_edits":
141 elif keyname
== "binaryseq":
142 return self.binaryseq
143 elif keyname
== "binaryqual":
144 return self.binaryqual
147 elif keyname
== "cigar":
149 elif keyname
== "MD":
152 raise KeyError("No such key", keyname
)
154 def __getstate__
( self ):
155 return ( self.readname
, self.chrom
, self.lpos
, self.rpos
, self.strand
, self.binaryseq
, self.binaryqual
, self.l
, self.cigar
, self.MD
, self.n_edits
, self.SEQ
, self.QUAL
)
157 def __setstate__
( self, state
):
158 ( self.readname
, self.chrom
, self.lpos
, self.rpos
, self.strand
, self.binaryseq
, self.binaryqual
, self.l
, self.cigar
, self.MD
, self.n_edits
, self.SEQ
, self.QUAL
) = state
160 # cpdef bytearray get_SEQ ( self ):
161 # """Convert binary seq to ascii seq.
163 # Rule: for each byte, 1st base in the highest 4bit; 2nd in the lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15]
165 # Note: In BAM, if a sequence is mapped to reverse strand, the
166 # reverse complement seq is written in SEQ field. So the return
167 # value of this function will not be the original one if the
168 # read is mapped to - strand.
174 # seq = bytearray(b"")
175 # for c in self.binaryseq:
177 # seq.append( __BAMDNACODE__[c >> 4 & 15] )
179 # seq.append( __BAMDNACODE__[c & 15] )
180 # if seq[-1] == b"=":
181 # # trim the last '=' if it exists
185 cdef tuple __get_SEQ_QUAL
( self ):
186 """Get tuple of (SEQ, QUAL).
188 Rule: for each byte, 1st base in the highest 4bit; 2nd in the lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15]
190 Note: In BAM, if a sequence is mapped to reverse strand, the
191 reverse complement seq is written in SEQ field. So the return
192 value of this function will not be the original one if the
193 read is mapped to - strand. If you need to original one, do
194 reversecomp for SEQ and reverse QUAL.
203 qual
= bytearray
(b
"")
205 for i
in range( len(self.binaryseq
) ):
206 c
= self.binaryseq
[ i
]
208 seq
.append
( __BAMDNACODE__
[c
>> 4 & 15] )
210 seq
.append
( __BAMDNACODE__
[c
& 15] )
212 for i
in range( len( self.binaryqual
) ):
213 # qual is the -10log10 p or phred score.
214 qual
.append
( self.binaryqual
[i
] )
217 # trim the last '=' if it exists
219 assert len( seq
) == len( qual
), Exception("Lengths of seq and qual are not consistent!")
221 # Example on how to get original SEQ and QUAL:
225 # seq = seq.translate( __DNACOMPLEMENT__ )
228 return ( bytes
(seq
), bytes
(qual
) )
231 cpdef bytes get_FASTQ
( self ):
232 """Get FASTQ format text.
240 qual
= bytearray
(self.QUAL
)
242 for i
in range( len( self.QUAL
) ):
243 # qual is the -10log10 p or phred score, to make FASTQ, we have to add 33
246 # reverse while necessary
250 seq
= seq
.translate
( __DNACOMPLEMENT__
)
255 return b
"@" + self.readname
+ b
"\n" + seq
+ b
"\n+\n" + qual
+ b
"\n"
257 cpdef bytearray get_REFSEQ
( self ):
258 """Fetch reference sequence, using self.MD and self.cigar
262 bytearray seq
, refseq
263 int i
, cigar_op
, cigar_op_l
266 bool flag_del
# flag for deletion event in query
268 seq
= bytearray
(self.SEQ
) # we start with read seq then make modifications
271 # First step: use CIGAR to edit SEQ to remove S (softclip) and I (insert)
272 # __CIGARCODE__ = "MIDNSHP=X"
273 # let ind be the index in SEQ
278 if cigar_op
in [2, 5, 6]: # do nothing for Deletion (we will
279 # put sequence back in step 2),
280 # Hardclip and Padding
282 elif cigar_op
in [0, 7, 8]: # M = X alignment match (match or
284 # do nothing and move ind
286 elif cigar_op
in [ 1, 4 ]: # Remove for Insertion or Softclip
287 seq
[ ind
: ind
+ cigar_op_l
] = b
''
289 # now the seq should be at the same length as rpos-lpos
291 # Second step: use MD string to edit SEQ to put back 'deleted
292 # seqs' and modify mismatches
294 # let ind be the index in SEQ again, from 0
296 MD_op
= bytearray
(b
'')
299 if c
< 58 and c
> 47:
303 elif (c
> 64 and c
< 91) and not flag_del
:
304 # An alphabet means Mismatch, Note, if MD is made
305 # right, a mismatch should only be 1 letter surrounded
311 MD_op
= bytearray
(b
'')
312 elif (c
> 64 and c
< 91) and flag_del
:
313 seq
[ ind
:ind
] = [c
,]
316 # means Deletion in query. Now, insert a sequnce into
321 MD_op
= bytearray
(b
'')
323 raise Exception("Don't understand this operator in MD: %c" % c
)
324 #print( seq.decode() )
328 cpdef get_base_by_ref_pos
( self, long ref_pos
):
329 """Get base by ref position.
334 assert self.lpos
<= ref_pos
and self.rpos
> ref_pos
, Exception("Given position out of alignment location")
335 relative_pos
= ref_pos
- self.lpos
336 p
= self.relative_ref_pos_to_relative_query_pos
( relative_pos
)
338 if p
== -1: # located in a region deleted in query
341 return __BAMDNACODE__
[ (self.binaryseq
[p
//2] >> ((1-p
%2)*4) ) & 15 ]
343 cpdef get_bq_by_ref_pos
( self, long ref_pos
):
344 """Get base quality by ref position. Base quality is in Phred scale.
346 Returned value is the raw Phred-scaled base quality.
351 assert self.lpos
<= ref_pos
and self.rpos
> ref_pos
, Exception("Given position out of alignment location")
352 relative_pos
= ref_pos
- self.lpos
353 p
= self.relative_ref_pos_to_relative_query_pos
( relative_pos
)
355 if p
== -1: # located in a region deleted in query
358 return self.binaryqual
[p
]
360 cpdef
tuple get_base_bq_by_ref_pos
( self, long ref_pos
):
361 """Get base and base quality by ref position. Base quality is in Phred scale.
363 Returned bq is the raw Phred-scaled base quality.
367 assert self.lpos
<= ref_pos
and self.rpos
> ref_pos
, Exception("Given position out of alignment location")
368 relative_pos
= ref_pos
- self.lpos
369 p
= self.relative_ref_pos_to_relative_query_pos
( relative_pos
)
371 if p
== -1: # located in a region deleted in query
374 return ( __BAMDNACODE__
[ (self.binaryseq
[p
//2] >> ((1-p
%2)*4) ) & 15 ], self.binaryqual
[p
] )
376 cpdef
tuple get_variant_bq_by_ref_pos
( self, long ref_pos
):
377 """Get any variants (different with reference) and base quality by ref position.
385 3) -, if the reference base is deleted, in this case, bq will
386 be the highest possible bq, which is 93.
388 4) ^<A/T/C/G>+, if there is an insertion at the location
390 Base quality is the raw Phred-scaled base quality.
399 bytes p_refseq
, p_seq
403 assert self.lpos
<= ref_pos
and self.rpos
> ref_pos
, Exception("Given position out of alignment location")
405 res
= ref_pos
- self.lpos
# residue
408 refseq
= self.get_REFSEQ
()
409 p_refseq
= refseq
[ res
]
412 #M 0 alignment match (can be a sequence match or mismatch) insertion to the reference
413 #I 1 insertion to the reference
414 #D 2 deletion from the reference
415 #N 3 skipped region from the reference
416 #S 4 soft clipping (clipped sequences present in SEQ)
417 #H 5 hard clipping (clipped sequences NOT present in SEQ)
418 #P 6 padding (silent deletion from padded reference)
420 #X 8 sequence mismatch
422 seq_array
= bytearray
( b
'' )
423 bq_array
= bytearray
( b
'' )
425 for m
in range( len(self.cigar
) ):
429 if op
in [0, 7, 8]: # M = X alignment match (match or mismatch)
431 # in the range of a CIGAR operator
433 # find the position, now get the ref
435 seq_array
.append
( __BAMDNACODE__
[ (self.binaryseq
[ p
//2 ] >> ( (1-p
%2)*4 ) ) & 15 ] )
436 bq_array
.append
( self.binaryqual
[ p
] )
438 elif res
== op_l
- 1:
441 seq_array
.append
( __BAMDNACODE__
[ (self.binaryseq
[ p
//2 ] >> ( (1-p
%2)*4 ) ) & 15 ] )
442 bq_array
.append
( self.binaryqual
[ p
] )
443 # now add any insertion later on
445 if m
+ 1 == len( self.cigar
):
447 i
= self.cigar
[ m
+ 1 ]
450 if op
== 1: #insertion
451 for n
in range( op_l
):
453 seq_array
.append
( __BAMDNACODE__
[ (self.binaryseq
[ p
//2 ] >> ( (1-p
%2)*4 ) ) & 15 ] )
454 bq_array
.append
( self.binaryqual
[ p
] )
455 #print self.SEQ, seq_array
458 # go to the next cigar code
461 elif op
in [ 2, 3 ]: # D N
463 # find the position, however ...
464 # position located in a region in reference that not exists in query
466 seq_array
.append
( b
'*' )
467 bq_array
.append
( 93 ) #assign 93 for deletion
470 # go to the next cigar code
472 elif op
== 1 : # Insertion
474 # if res == 0: # no residue left, so return a chunk of inserted sequence
475 # print "shouldn't run this code"
476 # # first, add the insertion point
477 # seq_array = bytearray( b'~' )
478 # bq_array.append( self.binaryqual[ p ] )
479 # # then add the inserted seq
480 # for i in range( op_l ):
482 # seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] )
483 # bq_array.append( self.binaryqual[ p ] )
487 elif op
== 4 : # Softclip. If it's Softclip, we'd better not return the extra seq
490 if pos
== 0 or pos
== self.l
- 1:
495 return ( seq_array
, bq_array
, self.strand
, tip
, pos
)
497 #raise Exception("Not expected to see this")
499 cdef int relative_ref_pos_to_relative_query_pos
( self, long relative_ref_pos
):
500 """Convert relative pos on ref to pos on query.
505 res
= relative_ref_pos
510 if op
in [0, 7, 8]: # M = X alignment match (match or mismatch)
517 elif op
in [ 2, 3 ]: # D N
519 # position located in a region in reference that not exists in query
523 elif op
in [ 1, 4 ]: # I