Merge pull request #589 from tjni/clean-up-build-dependencies
[MACS.git] / MACS3 / Signal / ReadAlignment.pyx
blobd5376350b6b542581b666d3652b0d14761105212
1 # cython: language_level=3
2 # cython: profile=True
3 # Time-stamp: <2021-03-10 16:21:51 Tao Liu>
5 """Module for SAPPER ReadAlignment class
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file COPYING included
9 with the distribution).
10 """
12 # ------------------------------------
13 # python modules
14 # ------------------------------------
15 from cpython cimport bool
17 cdef extern from "stdlib.h":
18 ctypedef unsigned int size_t
19 size_t strlen(char *s)
20 void *malloc(size_t size)
21 void *calloc(size_t n, size_t size)
22 void free(void *ptr)
23 int strcmp(char *a, char *b)
24 char * strcpy(char *a, char *b)
25 long atol(char *bytes)
26 int atoi(char *bytes)
28 # ------------------------------------
29 # constants
30 # ------------------------------------
32 __BAMDNACODE__ = b"=ACMGRSVTWYHKDBN"
33 __CIGARCODE__ = "MIDNSHP=X"
34 __DNACOMPLEMENT__ = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@TBGDEFCHIJKLMNOPQRSAUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' # A trans table to convert A to T, C to G, G to C, and T to A.
36 # -- CIGAR CODE --
37 #OP BAM Description
38 #M 0 alignment match (can be a sequence match or mismatch) insertion to the reference
39 #I 1 insertion to the reference
40 #D 2 deletion from the reference
41 #N 3 skipped region from the reference
42 #S 4 soft clipping (clipped sequences present in SEQ)
43 #H 5 hard clipping (clipped sequences NOT present in SEQ)
44 #P 6 padding (silent deletion from padded reference)
45 #= 7 sequence match
46 #X 8 sequence mismatch
47 # -- -- -- -- -- --
49 # ------------------------------------
50 # Misc functions
51 # ------------------------------------
53 # ------------------------------------
54 # Classes
55 # ------------------------------------
56 cdef class ReadAlignment:
57 cdef:
58 bytes readname
59 bytes chrom
60 int lpos
61 int rpos
62 int strand # strand information. 0 means forward strand, 1 means reverse strand.
63 bytes binaryseq
64 bytes binaryqual
65 int l # length of read
66 tuple cigar # each item contains op_l|op
67 bytes MD
68 int n_edits # number of edits; higher the number,
69 # more differences with reference
70 bytes SEQ # sequence of read regarding to + strand
71 bytes QUAL # quality of read regarding to + strand
73 def __init__ ( self,
74 bytes readname,
75 bytes chrom, int lpos, int rpos,
76 int strand,
77 bytes binaryseq,
78 bytes binaryqual,
79 tuple cigar,
80 bytes MD ):
81 self.readname = readname
82 self.chrom = chrom
83 self.lpos = lpos
84 self.rpos = rpos
85 self.strand = strand
86 self.binaryseq = binaryseq
87 self.binaryqual = binaryqual
88 self.l = len( binaryqual )
89 self.cigar = cigar
90 self.MD = MD
91 self.n_edits = self.get_n_edits()
92 (self.SEQ, self.QUAL) = self.__get_SEQ_QUAL()
94 cdef int get_n_edits( self ):
95 """The number is from self.cigar and self.MD.
97 """
98 cdef:
99 int n_edits
100 int i, cigar_op, cigar_op_l
101 char c
103 n_edits = 0
104 for i in self.cigar: # only count insertion or softclip
105 cigar_op = i & 15
106 cigar_op_l = i >> 4
107 if cigar_op in [ 1, 4 ]: # count Insertion or Softclip
108 n_edits += cigar_op_l
110 for c in self.MD:
111 if (c > 64 and c < 91): # either deletion in query or mismatch
112 n_edits += 1
113 return n_edits
115 def __str__ ( self ):
116 c = self.chrom.decode()
117 n = self.readname.decode()
118 if self.strand:
119 s = "-"
120 else:
121 s = "+"
122 return f"{c}\t{self.lpos}\t{self.rpos}\t{n}\t{self.l}\t{s}"
124 def __getitem__ ( self, keyname ):
125 if keyname == "readname":
126 return self.readname
127 elif keyname == "chrom":
128 return self.chrom
129 elif keyname == "lpos":
130 return self.lpos
131 elif keyname == "rpos":
132 return self.rpos
133 elif keyname == "strand":
134 return self.strand
135 elif keyname == "SEQ":
136 return self.SEQ
137 elif keyname == "QUAL":
138 return self.QUAL
139 elif keyname == "n_edits":
140 return self.n_edits
141 elif keyname == "binaryseq":
142 return self.binaryseq
143 elif keyname == "binaryqual":
144 return self.binaryqual
145 elif keyname == "l":
146 return self.l
147 elif keyname == "cigar":
148 return self.cigar
149 elif keyname == "MD":
150 return self.MD
151 else:
152 raise KeyError("No such key", keyname)
154 def __getstate__ ( self ):
155 return ( self.readname, self.chrom, self.lpos, self.rpos, self.strand, self.binaryseq, self.binaryqual, self.l, self.cigar, self.MD, self.n_edits, self.SEQ, self.QUAL )
157 def __setstate__ ( self, state ):
158 ( self.readname, self.chrom, self.lpos, self.rpos, self.strand, self.binaryseq, self.binaryqual, self.l, self.cigar, self.MD, self.n_edits, self.SEQ, self.QUAL ) = state
160 # cpdef bytearray get_SEQ ( self ):
161 # """Convert binary seq to ascii seq.
163 # Rule: for each byte, 1st base in the highest 4bit; 2nd in the lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15]
165 # Note: In BAM, if a sequence is mapped to reverse strand, the
166 # reverse complement seq is written in SEQ field. So the return
167 # value of this function will not be the original one if the
168 # read is mapped to - strand.
169 # """
170 # cdef:
171 # char c
172 # bytearray seq
174 # seq = bytearray(b"")
175 # for c in self.binaryseq:
176 # # high
177 # seq.append( __BAMDNACODE__[c >> 4 & 15] )
178 # # low
179 # seq.append( __BAMDNACODE__[c & 15] )
180 # if seq[-1] == b"=":
181 # # trim the last '=' if it exists
182 # seq = seq[:-1]
183 # return seq
185 cdef tuple __get_SEQ_QUAL ( self ):
186 """Get tuple of (SEQ, QUAL).
188 Rule: for each byte, 1st base in the highest 4bit; 2nd in the lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15]
190 Note: In BAM, if a sequence is mapped to reverse strand, the
191 reverse complement seq is written in SEQ field. So the return
192 value of this function will not be the original one if the
193 read is mapped to - strand. If you need to original one, do
194 reversecomp for SEQ and reverse QUAL.
196 cdef:
197 int i
198 char c
199 bytearray seq
200 bytearray qual
202 seq = bytearray(b"")
203 qual = bytearray(b"")
205 for i in range( len(self.binaryseq) ):
206 c = self.binaryseq[ i ]
207 # high
208 seq.append( __BAMDNACODE__[c >> 4 & 15] )
209 # low
210 seq.append( __BAMDNACODE__[c & 15] )
212 for i in range( len( self.binaryqual ) ):
213 # qual is the -10log10 p or phred score.
214 qual.append( self.binaryqual[i] )
216 if seq[-1] == b"=":
217 # trim the last '=' if it exists
218 seq = seq[:-1]
219 assert len( seq ) == len( qual ), Exception("Lengths of seq and qual are not consistent!")
221 # Example on how to get original SEQ and QUAL:
222 #if self.strand:
223 # seq.reverse()
224 # #compliment
225 # seq = seq.translate( __DNACOMPLEMENT__ )
226 # qual.reverse()
228 return ( bytes(seq), bytes(qual) )
231 cpdef bytes get_FASTQ ( self ):
232 """Get FASTQ format text.
235 cdef:
236 bytes seq
237 bytearray qual
239 seq = self.SEQ
240 qual = bytearray(self.QUAL)
242 for i in range( len( self.QUAL ) ):
243 # qual is the -10log10 p or phred score, to make FASTQ, we have to add 33
244 qual[ i ] += 33
246 # reverse while necessary
247 if self.strand:
248 seq = self.SEQ[::-1]
249 #compliment
250 seq = seq.translate( __DNACOMPLEMENT__ )
251 qual = qual[::-1]
252 else:
253 seq = self.SEQ
255 return b"@" + self.readname + b"\n" + seq + b"\n+\n" + qual + b"\n"
257 cpdef bytearray get_REFSEQ ( self ):
258 """Fetch reference sequence, using self.MD and self.cigar
260 cdef:
261 char c
262 bytearray seq, refseq
263 int i, cigar_op, cigar_op_l
264 bytearray MD_op
265 int ind
266 bool flag_del # flag for deletion event in query
268 seq = bytearray(self.SEQ) # we start with read seq then make modifications
270 # 2-step proces
271 # First step: use CIGAR to edit SEQ to remove S (softclip) and I (insert)
272 # __CIGARCODE__ = "MIDNSHP=X"
273 # let ind be the index in SEQ
274 ind = 0
275 for i in self.cigar:
276 cigar_op = i & 15
277 cigar_op_l = i >> 4
278 if cigar_op in [2, 5, 6]: # do nothing for Deletion (we will
279 # put sequence back in step 2),
280 # Hardclip and Padding
281 pass
282 elif cigar_op in [0, 7, 8]: # M = X alignment match (match or
283 # mismatch)
284 # do nothing and move ind
285 ind += cigar_op_l
286 elif cigar_op in [ 1, 4 ]: # Remove for Insertion or Softclip
287 seq[ ind : ind + cigar_op_l ] = b''
289 # now the seq should be at the same length as rpos-lpos
291 # Second step: use MD string to edit SEQ to put back 'deleted
292 # seqs' and modify mismatches
294 # let ind be the index in SEQ again, from 0
295 ind = 0
296 MD_op = bytearray(b'')
297 flag_del = False
298 for c in self.MD:
299 if c < 58 and c > 47:
300 # means Match
301 flag_del = False
302 MD_op.append(c)
303 elif (c > 64 and c < 91) and not flag_del:
304 # An alphabet means Mismatch, Note, if MD is made
305 # right, a mismatch should only be 1 letter surrounded
306 # by digits.
307 ind += int(MD_op)
308 seq[ ind ] = c
309 ind += 1
310 # reset MD_op
311 MD_op = bytearray(b'')
312 elif (c > 64 and c < 91) and flag_del:
313 seq[ ind:ind ] = [c,]
314 ind += 1
315 elif c == 94:
316 # means Deletion in query. Now, insert a sequnce into
317 # SEQ
318 flag_del = True
319 ind += int(MD_op)
320 # reset MD_op
321 MD_op = bytearray(b'')
322 else:
323 raise Exception("Don't understand this operator in MD: %c" % c)
324 #print( seq.decode() )
326 return seq
328 cpdef get_base_by_ref_pos ( self, long ref_pos ):
329 """Get base by ref position.
332 cdef:
333 int relative_pos, p
334 assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location")
335 relative_pos = ref_pos - self.lpos
336 p = self.relative_ref_pos_to_relative_query_pos( relative_pos )
338 if p == -1: # located in a region deleted in query
339 return None
340 else:
341 return __BAMDNACODE__[ (self.binaryseq[p//2] >> ((1-p%2)*4) ) & 15 ]
343 cpdef get_bq_by_ref_pos ( self, long ref_pos ):
344 """Get base quality by ref position. Base quality is in Phred scale.
346 Returned value is the raw Phred-scaled base quality.
349 cdef:
350 int relative_pos, p
351 assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location")
352 relative_pos = ref_pos - self.lpos
353 p = self.relative_ref_pos_to_relative_query_pos( relative_pos )
355 if p == -1: # located in a region deleted in query
356 return None
357 else:
358 return self.binaryqual[p]
360 cpdef tuple get_base_bq_by_ref_pos ( self, long ref_pos ):
361 """Get base and base quality by ref position. Base quality is in Phred scale.
363 Returned bq is the raw Phred-scaled base quality.
365 cdef:
366 int relative_pos, p
367 assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location")
368 relative_pos = ref_pos - self.lpos
369 p = self.relative_ref_pos_to_relative_query_pos( relative_pos )
371 if p == -1: # located in a region deleted in query
372 return None
373 else:
374 return ( __BAMDNACODE__[ (self.binaryseq[p//2] >> ((1-p%2)*4) ) & 15 ], self.binaryqual[p] )
376 cpdef tuple get_variant_bq_by_ref_pos ( self, long ref_pos ):
377 """Get any variants (different with reference) and base quality by ref position.
379 variants will be
381 1) =, if identical
383 2) A/T/C/G, if SNV
385 3) -, if the reference base is deleted, in this case, bq will
386 be the highest possible bq, which is 93.
388 4) ^<A/T/C/G>+, if there is an insertion at the location
390 Base quality is the raw Phred-scaled base quality.
393 cdef:
394 int i, m, n
395 int res, p, op, op_l
396 int pos
397 bool tip
398 bytearray refseq
399 bytes p_refseq, p_seq
400 bytearray seq_array
401 bytearray bq_array
403 assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location")
405 res = ref_pos - self.lpos # residue
406 p = 0
408 refseq = self.get_REFSEQ()
409 p_refseq = refseq[ res ]
410 # -- CIGAR CODE --
411 #OP BAM Description
412 #M 0 alignment match (can be a sequence match or mismatch) insertion to the reference
413 #I 1 insertion to the reference
414 #D 2 deletion from the reference
415 #N 3 skipped region from the reference
416 #S 4 soft clipping (clipped sequences present in SEQ)
417 #H 5 hard clipping (clipped sequences NOT present in SEQ)
418 #P 6 padding (silent deletion from padded reference)
419 #= 7 sequence match
420 #X 8 sequence mismatch
422 seq_array = bytearray( b'' )
423 bq_array = bytearray( b'' )
425 for m in range( len(self.cigar) ):
426 i = self.cigar[ m ]
427 op = i & 15
428 op_l = i >> 4
429 if op in [0, 7, 8]: # M = X alignment match (match or mismatch)
430 if res < op_l - 1:
431 # in the range of a CIGAR operator
432 p += res
433 # find the position, now get the ref
434 pos = p
435 seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] )
436 bq_array.append( self.binaryqual[ p ] )
437 break
438 elif res == op_l - 1:
439 p += res
440 pos = p
441 seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] )
442 bq_array.append( self.binaryqual[ p ] )
443 # now add any insertion later on
444 # get next cigar
445 if m + 1 == len( self.cigar ):
446 break
447 i = self.cigar[ m + 1 ]
448 op = i & 15
449 op_l = i >> 4
450 if op == 1: #insertion
451 for n in range( op_l ):
452 p += 1
453 seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] )
454 bq_array.append( self.binaryqual[ p ] )
455 #print self.SEQ, seq_array
456 break
457 else:
458 # go to the next cigar code
459 p += op_l
460 res -= op_l
461 elif op in [ 2, 3 ]: # D N
462 if res < op_l:
463 # find the position, however ...
464 # position located in a region in reference that not exists in query
465 pos = p
466 seq_array.append( b'*' )
467 bq_array.append( 93 ) #assign 93 for deletion
468 break
469 else:
470 # go to the next cigar code
471 res -= op_l
472 elif op == 1 : # Insertion
473 p += op_l
474 # if res == 0: # no residue left, so return a chunk of inserted sequence
475 # print "shouldn't run this code"
476 # # first, add the insertion point
477 # seq_array = bytearray( b'~' )
478 # bq_array.append( self.binaryqual[ p ] )
479 # # then add the inserted seq
480 # for i in range( op_l ):
481 # p += 1
482 # seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] )
483 # bq_array.append( self.binaryqual[ p ] )
484 # break
485 # else:
486 # p += op_l
487 elif op == 4 : # Softclip. If it's Softclip, we'd better not return the extra seq
488 p += op_l
490 if pos == 0 or pos == self.l - 1:
491 tip = True
492 else:
493 tip = False
495 return ( seq_array, bq_array, self.strand, tip, pos )
496 # last position ?
497 #raise Exception("Not expected to see this")
499 cdef int relative_ref_pos_to_relative_query_pos ( self, long relative_ref_pos ):
500 """Convert relative pos on ref to pos on query.
502 cdef:
503 int p, res, op, op_l
504 p = 0
505 res = relative_ref_pos
507 for i in self.cigar:
508 op = i & 15
509 op_l = i >> 4
510 if op in [0, 7, 8]: # M = X alignment match (match or mismatch)
511 if res < op_l:
512 p += res
513 return p
514 else:
515 p += op_l
516 res -= op_l
517 elif op in [ 2, 3 ]: # D N
518 if res < op_l:
519 # position located in a region in reference that not exists in query
520 return -1
521 else:
522 res -= op_l
523 elif op in [ 1, 4 ]: # I
524 p += op_l
525 return p
528 ### End ###