simplify output msgs of callvar
[MACS.git] / MACS3 / IO / BAM.pyx
blob6c5103cbb501b4954ded1697c56b7c373e28609b
1 # cython: language_level=3
2 # cython: profile=True
3 # cython: linetrace=True
4 # Time-stamp: <2021-03-10 23:24:04 Tao Liu>
6 """
8 This code is free software; you can redistribute it and/or modify it
9 under the terms of the BSD License (see the file LICENSE included with
10 the distribution).
11 """
13 # ------------------------------------
14 # python modules
15 # ------------------------------------
16 import logging
17 from logging import info, debug
18 import struct
19 from struct import unpack
20 import gzip
21 import io
22 import os
23 import sys
24 from zlib import decompress, MAX_WBITS
26 # ------------------------------------
27 # MACS3 modules
28 # ------------------------------------
29 from MACS3.Utilities.Constants import *
30 from MACS3.Signal.ReadAlignment import ReadAlignment
32 # ------------------------------------
33 # Other modules
34 # ------------------------------------
35 from cpython cimport bool
37 import numpy as np
38 cimport numpy as np
39 from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t
41 is_le = sys.byteorder == "little"
43 cdef extern from "stdlib.h":
44 ctypedef unsigned int size_t
45 size_t strlen(char *s)
46 void *malloc(size_t size)
47 void *calloc(size_t n, size_t size)
48 void free(void *ptr)
49 int strcmp(char *a, char *b)
50 char * strcpy(char *a, char *b)
51 long atol(char *bytes)
52 int atoi(char *bytes)
54 # ------------------------------------
55 # constants
56 # ------------------------------------
57 __version__ = "BAM $Revision$"
58 __author__ = "Tao Liu <vladimir.liu@gmail.com>"
59 __doc__ = "SAPPER BAMParser class"
61 # ------------------------------------
62 # Misc functions
63 # ------------------------------------
64 cdef list get_bins_by_region ( uint32_t beg, uint32_t end ):
65 """ Get the possible bins by given a region
66 """
67 cdef:
68 list bins = [ 0 ]
69 uint32_t k
70 # for different levels
71 for k in range(1 + (beg>>26), 2 + (end>>26) ):
72 bins.append( k )
73 for k in range(9 + (beg>>23), 10 + (end>>23) ):
74 bins.append( k )
75 for k in range(73 + (beg>>20), 74 + (end>>20) ):
76 bins.append( k )
77 for k in range(585 + (beg>>17), 586 + (end>>17) ):
78 bins.append( k )
79 for k in range(4681 + (beg>>14), 4682 + (end>>14) ):
80 bins.append( k )
81 return bins
83 cdef list reg2bins(rbeg, rend):
84 """Generates bin ids which overlap the specified region.
85 Args:
86 rbeg (int): inclusive beginning position of region
87 rend (int): exclusive end position of region
88 Yields:
89 (int): bin IDs for overlapping bins of region
90 Raises:
91 AssertionError (Exception): if the range is malformed or invalid
92 """
93 # Based off the algorithm presented in:
94 # https://samtools.github.io/hts-specs/SAMv1.pdf
96 # Bin calculation constants.
97 BIN_ID_STARTS = (0, 1, 9, 73, 585, 4681)
99 # Maximum range supported by specifications.
100 MAX_RNG = (2 ** 29) - 1
102 assert 0 <= rbeg <= rend <= MAX_RNG, 'Invalid region {}, {}'.format(rbeg, rend)
104 l = []
106 for start, shift in zip(BIN_ID_STARTS, range(29, 13, -3)):
107 i = rbeg >> shift if rbeg > 0 else 0
108 j = rend >> shift if rend < MAX_RNG else MAX_RNG >> shift
110 for bin_id_offset in range(i, j + 1):
111 #yield start + bin_id_offset
112 l.append( start + bin_id_offset )
113 return l
115 # ------------------------------------
116 # Classes
117 # ------------------------------------
118 class StrandFormatError( Exception ):
119 """Exception about strand format error.
121 Example:
122 raise StrandFormatError('Must be F or R','X')
124 def __init__ ( self, string, strand ):
125 self.strand = strand
126 self.string = string
128 def __str__ ( self ):
129 return repr( "Strand information can not be recognized in this line: \"%s\",\"%s\"" % ( self.string, self.strand ) )
131 class MDTagMissingError( Exception ):
132 """Exception about missing MD tag
134 Example:
135 raise MDTagMissingError( name, aux )
137 aux is the auxiliary data part.
139 def __init__ ( self, name, aux ):
140 self.name = name
141 self.aux = aux
143 def __str__ ( self ):
144 return repr( "MD tag is missing! Please use \"samtools calmd\" command to add MD tags!\nName of sequence:%s\nCurrent auxiliary data section: %s" % (self.name.decode(), self.aux.decode() ) )
147 cdef class BAIFile:
148 """BAI File Class for BAI (index of BAM) File.
150 While initiating the object, BAI file will be loaded and the
151 information of bins and chunks will be saved in the class object.
153 Please refer to https://samtools.github.io/hts-specs/SAMv1.pdf for
154 detail definition of BAI file.
156 cdef:
157 str filename # filename
158 object fhd # file handler
159 bytes magic # magic code for this file
160 uint32_t n_ref # number of refseqs/chromosomes
161 dict metadata # metadata for ref seqs
162 uint64_t n_bins # total number of bins
163 uint64_t n_chunks # total number of chunks
164 uint64_t n_mapped # total mapped reads
165 uint64_t n_unmapped # total unmapped reads
166 list bins
168 def __init__ ( self, str filename ):
169 """Open input file. Determine whether it's a gzipped file.
171 'filename' must be a string object.
173 This function initialize the following attributes:
175 1. self.filename: the filename for input file.
176 2. self.gzipped: a boolean indicating whether input file is gzipped.
177 3. self.fhd: buffered I/O stream of input file
179 self.filename = filename
180 self.fhd = io.open( filename, mode='rb' ) # binary mode! I don't expect unicode here!
181 self.magic = self.fhd.read( 4 )
182 if self.magic != b"BAI\1":
183 raise Exception(f"Not a BAI file. The first 4 bytes are \'{self.magic}\'")
184 self.n_ref = self.__read_n_ref()
185 self.bins = list( range( self.n_ref ) )
186 self.metadata = dict.fromkeys( self.bins )
187 self.__load_bins()
188 self.fhd.close()
189 return
191 def __cinit__ ( self ):
192 self.n_ref = 0
193 self.n_bins = 0
194 self.n_chunks = 0
195 self.n_mapped = 0
196 self.n_unmapped = 0
198 def __str__ ( self ):
199 tmp = list(self.bins[0].keys())[:10]
200 return f"""
201 Summary of BAI File:
202 filename: {self.filename}
203 magic: {self.magic}
204 number of reference sequences/chromosomes: {self.n_ref}
205 number of total bins: {self.n_bins}
206 number of total chunks: {self.n_chunks}
207 number of total mapped reads: {self.n_mapped}
208 number of total unmapped reads: {self.n_unmapped}
209 Example of bins: ref 0, {tmp} ..
210 Example of metadata: ref 0, {self.metadata[ 0 ]}
213 cdef uint32_t __read_n_ref ( self ):
214 cdef:
215 uint32_t ret_val
216 self.fhd.seek( 4 ) # skip magic
217 ret_val = unpack( '<I', self.fhd.read( 4 ) )[ 0 ]
218 return ret_val
220 cdef __load_bins ( self ):
221 cdef:
222 uint32_t i, j, m
223 uint32_t this_n_bin, this_n_intv
224 uint32_t this_bin
225 uint32_t this_n_chunk
226 dict this_bin_dict
227 list this_chunks
228 list pseudobin
229 # skip the magic and n_ref
230 self.fhd.seek( 8 )
231 # for each ref seq/chromosome
232 for i in range( self.n_ref ):
233 # get number of bins
234 this_n_bin= unpack( '<I', self.fhd.read( 4 ) )[ 0 ]
235 # must be less than 37451
236 assert this_n_bin <= 37451
237 # increment the total n_bins counter
238 self.n_bins += this_n_bin
239 # initiate this_bin_dict for this ref seq
240 this_bin_dict = {}
241 # for each bin
242 for j in range( this_n_bin ):
243 ( this_bin, this_n_chunk ) = unpack( '<II', self.fhd.read( 8 ) )
244 # increment the total n_chunks counter
245 self.n_chunks += this_n_chunk
246 this_chunks = []
247 for m in range( this_n_chunk ):
248 this_chunks.append( unpack( '<qq', self.fhd.read( 16 ) ) )
249 # put the list of chunks in the this_bin_dict
250 this_bin_dict[ this_bin ] = this_chunks
251 # put the this_bin_dict in the list
252 self.bins[ i ] = this_bin_dict
253 # we will skip the next linear index part -- since I don't
254 # think we need them in regular
255 # ChIP-seq/ATAC-seq/other-seq analysis. Let me know if I am wrong!
256 this_n_intv = unpack( '<I', self.fhd.read( 4 ) )[ 0 ]
257 # skip each 64bits
258 self.fhd.seek( 8 * this_n_intv, 1 )
259 # we will also skip n_no_coor, the Number of unplaced unmapped reads
260 # now extract and clean up the pseudo-bins
261 # if empty, just skip
262 for i in range( self.n_ref ):
263 if self.bins[ i ]:
264 pseudobin = self.bins[ i ].pop( 37450 )
265 self.metadata[ i ] = {"ref_beg": pseudobin[0][0], "ref_end": pseudobin[0][1],
266 "n_mapped": pseudobin[1][0], "n_unmapped": pseudobin[1][1] }
267 self.n_mapped += pseudobin[1][0]
268 self.n_unmapped += pseudobin[1][1]
269 return
271 cpdef list get_chunks_by_bin ( self, uint32_t ref_n, uint32_t bin_n ):
272 """Get the chunks by bin number, for a given ref seq (not the name,
273 but the index).
275 The chunks will be sorted using default python sorted
276 function. Therefore the result will be the order of the offset
277 of beginning of each chunks.
280 return sorted( self.bins[ ref_n ].get( bin_n, [] ) )
282 cpdef list get_chunks_by_list_of_bins ( self, uint32_t ref_n, list bins ):
283 """Similar to get_chunks_by_bin, but accept a list of bins.
285 Note: The redudant bins in the list will be removed.
288 cdef:
289 uint32_t bin_n
290 list chunks = []
291 set bin_set
292 bin_set = set( bins )
293 for bin_n in bin_set:
294 chunks.extend( self.bins[ ref_n ].get( bin_n, [] ) )
295 return sorted( chunks )
297 cpdef dict get_metadata_by_refseq ( self, uint32_t ref_n ):
298 return self.metadata[ ref_n ]
300 cpdef list get_chunks_by_region ( self, uint32_t ref_n, uint32_t beg, uint32_t end ):
301 """Get the chunks by given a region in a given refseq (not the name,
302 but the index).
305 cdef:
306 list bins
307 list chunks
308 bins = get_bins_by_region( beg, end )
309 chunks = self.get_chunks_by_list_of_bins( ref_n, bins )
310 return chunks
312 cpdef list get_chunks_by_list_of_regions ( self, uint32_t ref_n, list regions ):
313 """ Similar to get_chunks_by_region, but accept a list of regions
315 cdef:
316 int i
317 list temp_bins
318 list bins = []
319 uint32_t beg, end
321 for i in range( len(regions) ):
322 beg = regions[ i ][ 0 ]
323 end = regions[ i ][ 1 ]
324 temp_bins = get_bins_by_region( beg, end )
325 bins.extend( temp_bins )
326 return self.get_chunks_by_list_of_bins( ref_n, bins )
328 cpdef uint64_t get_coffset_by_region ( self, uint32_t ref_n, uint32_t beg, uint32_t end ):
329 """Find the offset to access the BGZF block which contains the
330 leftmost reads overlapping with the given genomic region.
332 cdef:
333 uint64_t voffset_tmp, coffset_tmp
334 list chunks
335 tuple chunk
336 int i
337 uint64_t coffset
339 chunks = self.get_chunks_by_region( ref_n, beg, end )
340 if not chunks:
341 return 0
342 coffset = chunks[0][0] >> 16
343 for i in range( 1, len(chunks) ):
344 voffset_tmp = chunks[i][0]
345 coffset_tmp = voffset_tmp >> 16
346 if coffset_tmp < coffset:
347 coffset = coffset_tmp
348 return coffset
350 cpdef uint64_t get_coffsets_by_list_of_regions ( self, uint32_t ref_n, list regions ):
351 """Find the offset to access the BGZF block which contains the
352 leftmost reads overlapping with the given genomic region.
354 cdef:
355 uint32_t beg, end
356 int i
357 uint64_t coffset
358 list coffset_list
360 coffset_list = []
361 for i in range( len(regions) ):
362 beg = regions[ i ][ 0 ]
363 end = regions[ i ][ 1 ]
364 coffset = get_coffset_by_region( ref_n, beg, end )
365 coffset_list.append( coffset )
366 return coffset_list
368 cdef class BAMaccessor:
369 """This BAM file reader is designed to access certain alignment
370 records that overlap with givin genomic coordinates.
372 The BAMaccessor needs a matching BAM index file (.bai) for fast
373 access. Please refer to SAM format specification:
375 https://samtools.github.io/hts-specs/SAMv1.pdf
377 Note, the header of BAM will still be read through 'traditional'
378 way -- using gzip to read through. But for accessing specific
379 alignments, we will read BAM as binary, access the compressed
380 chunk then use zlib.decompress.
383 cdef:
384 # all private
385 str bam_filename # BAM filename
386 str bai_filename # BAI filename
387 object bamfile # BAM file handler "rb" mode
388 BAIFile baifile # Matching BAI file
389 list references # name of references/chromosomes, contain the order of chromosomes
390 dict rlengths # lengths of references/chromosomes
391 bytes bgzf_block_cache # cache of decompressed bgzf_block
392 uint64_t coffset_cache # coffset of the cached bgzf_block
393 uint64_t noffset_cache # coffset of the next block of the cached bgzf_block
395 def __init__ ( self, str BAM_filename ):
396 """Open input file. Determine whether it's a gzipped file.
398 'filename' must be a string object.
400 It will call __parse_header to check if the file is BAM format
401 (through magic string); check if it's sorted by coordinates
402 (SO). It will then check if BAI is available.
405 self.bam_filename = BAM_filename
406 # parse the header
407 self.__parse_header()
408 # check if BAI is available
409 self.bai_filename = self.bam_filename + ".bai"
410 if os.path.exists( self.bai_filename ):
411 self.baifile = BAIFile( self.bai_filename ) # The baifile is not for IO, it already contains all content.
412 else:
413 raise Exception("BAI is not available! Please make sure the .bai")
414 self.bamfile = io.BufferedReader( io.open( self.bam_filename, mode='rb' ), buffer_size = READ_BUFFER_SIZE ) # binary mode to read the raw BGZF file
415 self.bgzf_block_cache = b""
416 self.coffset_cache = 0
417 self.noffset_cache = 0
419 cpdef close ( self ):
420 """Run this when this Parser will be never used.
422 Close file I/O stream.
424 self.bamfile.close()
426 cdef __parse_header( self ):
427 """Parse the HEADER of BAM. It will do the following thing:
429 1. read in references from BAM header, and save in
430 self.references, and self.rlengths.
432 2. invoke self.check_sorted to see if BAM is sorted by
433 coordinates, and set self.sorted.
435 3. remember the file pointer at the end of HEADER or the start
436 of the alignment blocks, and set self.SOA.
439 cdef:
440 object fhd # file handler from gzip.open; temporary use
441 int header_len, x, nc, nlength
442 bytes magic, refname
443 list references = []
444 dict rlengths = {}
445 bytes header_text
447 # we use traditional way to read the header -- through gzip
448 fhd = io.BufferedReader( gzip.open( self.bam_filename, mode='rb' ), buffer_size = READ_BUFFER_SIZE )
449 # get the first 4 bytes
450 magic = fhd.read( 4 )
451 if magic != b"BAM\1":
452 raise Exception( f"File \"{self.filenmame}\" is not a BAM file. The magic string is not \"BAM\\1\", but \"{magic}\" " )
453 # next 4bytes is length of header
454 header_len = unpack( '<i', fhd.read( 4 ) )[ 0 ]
455 header_text = unpack( '<%ds' % header_len, fhd.read( header_len ) )[0]
456 # get the number of chromosome
457 nc = unpack( '<i', fhd.read( 4 ) )[ 0 ]
458 for x in range( nc ):
459 # read each chromosome name
460 nlength = unpack( '<i', fhd.read( 4 ) )[ 0 ]
461 refname = fhd.read( nlength )[ :-1 ]
462 references.append( refname )
463 # don't jump over chromosome size
464 # we can use it to avoid falling of chrom ends during peak calling
465 rlengths[refname] = unpack( '<i', fhd.read( 4 ) )[ 0 ]
466 self.references = references
467 self.rlengths = rlengths
468 # read sorting information
469 if not self.__check_sorted( header_text ):
470 raise Exception( f"BAM should be sorted by coordinates!" )
471 # close
472 fhd.close()
473 return
475 cdef bool __check_sorted( self, bytes header ):
476 """Check if the BAM is sorted by looking at the HEADER text
478 cdef:
479 bytes l
480 int i
482 # HD line is the first line
483 l = header.split(b"\n")[0]
484 if l.startswith(b"@HD"):
485 # locate HD line
486 i = l.find(b"SO:")
487 if i != -1:
488 # I will only check the first 5 characters
489 if l[i+3:i+8] == b"coord":
490 return True
491 return False
493 cpdef list get_chromosomes ( self ):
494 """Get chromosomes in order of their appearance in BAM HEADER.
497 return self.references
499 cpdef dict get_rlengths ( self ):
500 """Get chromosomes in order of their appearance in BAM HEADER.
503 return self.rlengths
505 cdef tuple __decode_voffset ( self, uint64_t voffset ):
508 cdef:
509 uint64_t coffset
510 uint32_t uoffset
511 coffset = voffset >> 16
512 uoffset = voffset ^ (coffset << 16)
513 return ( coffset, uoffset )
515 cdef bool __seek ( self, uint64_t offset ):
516 """Move to given position
518 self.bamfile.seek( offset, 0 )
519 return True
521 cdef bool __retrieve_cdata_from_bgzf_block ( self ):
522 """Retrieve uncompressed CDATA from BGZF block
524 cdef:
525 uint16_t xlen, bsize
526 bytes extra, cdata
528 self.bamfile.seek( 10, 1 ) # skip 10 bytes
529 # get XLEN
530 xlen = unpack("<H", self.bamfile.read(2) )[0]
531 # get extra subfields
532 extra = self.bamfile.read(xlen)
533 bsize = unpack( '<H', extra[extra.index(b'BC\x02\x00') + 4 : ])[0]
534 cdata = self.bamfile.read( bsize - xlen - 19 )
535 self.bgzf_block_cache = decompress( cdata, -MAX_WBITS )
536 self.bamfile.seek( 8, 1 ) # move to the end of the block
537 return True
539 cpdef list get_reads_in_region ( self, bytes chrom, int left, int right, int maxDuplicate = 1 ):
540 """Get reads in a given region.
542 Return: list of ReadAlignment
544 cdef:
545 int entrylength, fpos, strand, chrid
546 list readslist
547 int cur_duplicates = 0
548 object read
549 object previous_read
550 int chrom_index
551 bytes dcdata
552 bool flag_end_searching = False
553 uint64_t coffset
555 readslist = []
556 cur_duplicates = 0
558 previous_read = None
559 chrom_index = self.references.index( chrom )
561 # get the coffset from BAI
562 coffset = self.baifile.get_coffset_by_region ( chrom_index, left, right )
563 if coffset == 0:
564 return readslist
565 if coffset != self.coffset_cache:
566 self.coffset_cache = coffset
567 self.__seek( coffset )
568 self.__retrieve_cdata_from_bgzf_block ( ) # note: self.cached_bgzf_block will be updated.
569 dcdata = self.bgzf_block_cache
571 while True:
572 dcdata_length = len(dcdata)
573 # now parse the reads in dcdata
574 i_bytes = 0
575 while i_bytes < dcdata_length:
576 entrylength = unpack( '<I', dcdata[ i_bytes : i_bytes + 4 ] )[ 0 ]
577 i_bytes += 4
578 # I am not sure if i_bytes+entrylength can be outside of dcdata_length...
579 read = self.__fw_binary_parse( dcdata[ i_bytes : i_bytes + entrylength ] )
580 i_bytes += entrylength
582 if read == None:
583 # no mapping information, skip to next one
584 continue
585 if read["lpos"] > right:
586 # outside of region, end searching
587 flag_end_searching = True
588 break
589 elif read["rpos"] > left:
590 # found overlap
591 # check if redundant
592 if previous_read != None and previous_read["lpos"] == read["lpos"] and previous_read["rpos"] == read["rpos"] and previous_read["strand"] == read["strand"] and previous_read["cigar"] == read["cigar"]:
593 cur_duplicates += 1
594 else:
595 cur_duplicates = 1
596 if cur_duplicates <= maxDuplicate:
597 readslist.append( read )
598 previous_read = read
599 if flag_end_searching:
600 break
601 # if not, get the next block, search again
602 self.coffset_cache = self.bamfile.tell()
603 self.__retrieve_cdata_from_bgzf_block ()
604 dcdata = self.bgzf_block_cache
605 if not dcdata:
606 # empty means EOF is reached.
607 break
608 return readslist
610 cdef object __fw_binary_parse (self, data, min_MAPQ=1 ):
611 """ Read information from an alignment block. Discard
612 alignment below a minimum MAPQ score, default: 1.
614 Return: ReadAlignment object with only selected information,
615 including refname (chromosome), leftmost alignment location,
616 strand information, sequence, quality, CIGAR and MD tag.
618 Note: We do require MD tag exists in BAM file.
620 cdef:
621 int ref, leftmost, rightmost, i, j, l_seq, strand
622 short bwflag, l_read_name, n_cigar_op, MAPQ
623 short bin_bam
624 bytes read_name
625 bytes seq #note: for each byte, 1st base in the highest 4bit; 2nd in the lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15]
626 bytes qual
627 tuple cigar_op # op_len<<4|op, op: "MIDNSHP=X" -> 012345678
628 bytes tag
629 bytes MD
631 if not data: return None
633 # read number of CIGAR operators, Bitwise FLAG
634 (n_cigar_op, bwflag ) = unpack( '<HH' , data[ 12:16 ] )
636 # first, we will discard problematic reads
637 if bwflag & 4 or bwflag & 512 or bwflag & 256 or bwflag & 2048:
638 return None #unmapped sequence or bad sequence or secondary or supplementary alignment
639 if bwflag & 1:
640 # paired read. We should only keep sequence if the mate is mapped
641 # Different with other MACS subcommands, both reads will be kept.
642 if not bwflag & 2:
643 return None # not a proper pair
644 if bwflag & 8:
645 return None # the mate is unmapped
647 # read length of readname, MAPQ, and bin information
648 (l_read_name, MAPQ, bin_bam) = unpack( '<BBH', data[ 8:12 ] )
650 # we will also discard reads having MAPQ lower than min_MAPQ. MAPQ=255 means no alignment
651 if MAPQ < min_MAPQ or MAPQ == 255:
652 return None
654 # Now we read other information
656 # read ref name
657 ref = unpack( '<i', data[ 0:4 ] )[ 0 ]
658 # read leftmost position of alignment
659 leftmost = unpack( '<i', data[ 4:8 ] )[ 0 ]
660 # read length of query sequence
661 l_seq = unpack( '<i', data[ 16:20 ] )[0]
662 # readname , skip next_refID, next_pos, tlen, which is 12 bytes or from the 32th byte
663 read_name = unpack( '<%ds' % (l_read_name), data[ 32: 32+l_read_name ] )[0][:-1] # last byte is \x00
664 # cigar_op, find the index i first.
665 i = 32 + l_read_name
666 cigar_op = unpack( '<%dI' % (n_cigar_op) , data[ i : i + n_cigar_op*4 ] )
667 # read sequence information
668 i += n_cigar_op*4
669 seq = unpack( '<%ds' % int((l_seq+1)/2), data[ i: i + int((l_seq+1)/2)] )[0]
670 # read quality information. Note: the value in BAM is the acutal Phred score, there is no +33!
671 i += int((l_seq+1)/2)
672 qual = unpack( '<%ds' % (l_seq), data[ i : i + l_seq])[0]
674 rightmost = leftmost
675 for j in cigar_op:
676 if j & 15 in [ 0, 2, 3, 7, 8 ]: # they are CIGAR op M/D/N/=/X, no need to add I, S, H, or P
677 rightmost += j >> 4
679 # strand information
680 if bwflag & 16:
681 # reverse strand
682 strand = 1
683 else:
684 strand = 0
686 # MD tag
687 MD = b''
688 i += l_seq
689 tag = data[ i : ]
690 j = tag.find(b'MDZ')
691 if j == -1: raise MDTagMissingError( data[ 32: 32+l_read_name], tag )
692 MD = tag[ j+3 : tag[j:].find(b"\0") + j ]
694 # construct a ReadAlignment object and return
695 return ReadAlignment( read_name, self.references[ref], leftmost, rightmost, strand, seq, qual, cigar_op, MD )