# cython: language_level=3
# cython: linetrace=True
# Time-stamp: <2021-03-10 23:24:04 Tao Liu>

"""
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""
# ------------------------------------
# python modules
# ------------------------------------
import io
import os
import sys
import gzip
from logging import info, debug
from struct import unpack
from zlib import decompress, MAX_WBITS
# ------------------------------------
# MACS3 modules
# ------------------------------------
from MACS3.Utilities.Constants import *
from MACS3.Signal.ReadAlignment import ReadAlignment
# ------------------------------------
# Other modules
# ------------------------------------
from cpython cimport bool
from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t

is_le = sys.byteorder == "little"
cdef extern from "stdlib.h":
    ctypedef unsigned int size_t
    size_t strlen(char *s)
    void *malloc(size_t size)
    void *calloc(size_t n, size_t size)
    int strcmp(char *a, char *b)
    char * strcpy(char *a, char *b)
    long atol(char *bytes)
# ------------------------------------
# constants
# ------------------------------------
__version__ = "BAM $Revision$"
__author__ = "Tao Liu <vladimir.liu@gmail.com>"
__doc__ = "SAPPER BAMParser class"
# ------------------------------------
# Misc functions
# ------------------------------------
cdef list get_bins_by_region ( uint32_t beg, uint32_t end ):
    """ Get the possible bins by given a region
    """
    cdef:
        list bins = [ 0 ]       # bin 0 is always included
        uint32_t k
    # for different levels
    for k in range(1 + (beg>>26), 2 + (end>>26) ):
        bins.append( k )
    for k in range(9 + (beg>>23), 10 + (end>>23) ):
        bins.append( k )
    for k in range(73 + (beg>>20), 74 + (end>>20) ):
        bins.append( k )
    for k in range(585 + (beg>>17), 586 + (end>>17) ):
        bins.append( k )
    for k in range(4681 + (beg>>14), 4682 + (end>>14) ):
        bins.append( k )
    return bins
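# Illustrative note (not part of the original module): for a small region such
# as [0, 1000), each loop above contributes a single bin, so the call returns
# one bin per indexing level:
#
#   get_bins_by_region(0, 1000)  ->  [0, 1, 9, 73, 585, 4681]
#
# Wider regions add a contiguous run of bin ids at the finer levels.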
cdef list reg2bins( rbeg, rend ):
    """Generates bin ids which overlap the specified region.

    Args:
        rbeg (int): inclusive beginning position of region
        rend (int): exclusive end position of region
    Returns:
        (list): bin IDs for overlapping bins of region
    Raises:
        AssertionError (Exception): if the range is malformed or invalid
    """
    # Based off the algorithm presented in:
    # https://samtools.github.io/hts-specs/SAMv1.pdf
    cdef list l = []
    # Bin calculation constants.
    BIN_ID_STARTS = (0, 1, 9, 73, 585, 4681)
    # Maximum range supported by specifications.
    MAX_RNG = (2 ** 29) - 1
    assert 0 <= rbeg <= rend <= MAX_RNG, 'Invalid region {}, {}'.format( rbeg, rend )
    for start, shift in zip( BIN_ID_STARTS, range(29, 13, -3) ):
        i = rbeg >> shift if rbeg > 0 else 0
        j = rend >> shift if rend < MAX_RNG else MAX_RNG >> shift
        for bin_id_offset in range( i, j + 1 ):
            #yield start + bin_id_offset
            l.append( start + bin_id_offset )
    return l
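# Illustrative note (not part of the original module): reg2bins() walks the
# same six binning levels as get_bins_by_region() above (shifts of 29, 26, 23,
# 20, 17 and 14 bits), so for a typical in-range region both helpers report
# the same bins; e.g. reg2bins(0, 1000) also gives [0, 1, 9, 73, 585, 4681].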
# ------------------------------------
# Classes
# ------------------------------------
class StrandFormatError( Exception ):
    """Exception about strand format error.

    Example:
    raise StrandFormatError('Must be F or R','X')
    """
    def __init__ ( self, string, strand ):
        self.string = string
        self.strand = strand

    def __str__ ( self ):
        return repr( "Strand information can not be recognized in this line: \"%s\",\"%s\"" % ( self.string, self.strand ) )
class MDTagMissingError( Exception ):
    """Exception about missing MD tag

    Example:
    raise MDTagMissingError( name, aux )

    aux is the auxiliary data part.
    """
    def __init__ ( self, name, aux ):
        self.name = name
        self.aux = aux

    def __str__ ( self ):
        return repr( "MD tag is missing! Please use \"samtools calmd\" command to add MD tags!\nName of sequence:%s\nCurrent auxiliary data section: %s" % ( self.name.decode(), self.aux.decode() ) )
148 """BAI File Class for BAI (index of BAM) File.
150 While initiating the object, BAI file will be loaded and the
151 information of bins and chunks will be saved in the class object.
153 Please refer to https://samtools.github.io/hts-specs/SAMv1.pdf for
154 detail definition of BAI file.
157 str filename
# filename
158 object fhd
# file handler
159 bytes magic
# magic code for this file
160 uint32_t n_ref
# number of refseqs/chromosomes
161 dict metadata
# metadata for ref seqs
162 uint64_t n_bins
# total number of bins
163 uint64_t n_chunks
# total number of chunks
164 uint64_t n_mapped
# total mapped reads
165 uint64_t n_unmapped
# total unmapped reads
    def __init__ ( self, str filename ):
        """Open input file. Determine whether it's a gzipped file.

        'filename' must be a string object.

        This function initializes the following attributes:

        1. self.filename: the filename for input file.
        2. self.gzipped: a boolean indicating whether input file is gzipped.
        3. self.fhd: buffered I/O stream of input file
        """
        self.filename = filename
        self.fhd = io.open( filename, mode='rb' ) # binary mode! I don't expect unicode here!
        self.magic = self.fhd.read( 4 )
        if self.magic != b"BAI\1":
            raise Exception(f"Not a BAI file. The first 4 bytes are \'{self.magic}\'")
        self.n_ref = self.__read_n_ref()
        self.bins = list( range( self.n_ref ) )
        self.metadata = dict.fromkeys( self.bins )
        # load all bins and chunks from the BAI file, then close the file handler
        self.__load_bins()
        self.fhd.close()
    def __cinit__ ( self ):
        self.n_bins = 0
        self.n_chunks = 0
        self.n_mapped = 0
        self.n_unmapped = 0
    def __str__ ( self ):
        tmp = list( self.bins[0].keys() )[:10]
        return f"""Summary of BAI file:
filename: {self.filename}
number of reference sequences/chromosomes: {self.n_ref}
number of total bins: {self.n_bins}
number of total chunks: {self.n_chunks}
number of total mapped reads: {self.n_mapped}
number of total unmapped reads: {self.n_unmapped}
Example of bins: ref 0, {tmp} ..
Example of metadata: ref 0, {self.metadata[ 0 ]}
"""
    cdef uint32_t __read_n_ref ( self ):
        cdef:
            uint32_t ret_val
        self.fhd.seek( 4 ) # skip magic
        ret_val = unpack( '<I', self.fhd.read( 4 ) )[ 0 ]
        return ret_val
    cdef __load_bins ( self ):
        cdef:
            uint32_t i, j, m
            uint32_t this_n_bin, this_n_intv
            uint32_t this_bin
            uint32_t this_n_chunk
            dict this_bin_dict
            list this_chunks
            list pseudobin
        # skip the magic and n_ref
        self.fhd.seek( 8 )
        # for each ref seq/chromosome
        for i in range( self.n_ref ):
            # get the number of bins
            this_n_bin = unpack( '<I', self.fhd.read( 4 ) )[ 0 ]
            # must be less than 37451
            assert this_n_bin <= 37451
            # increment the total n_bins counter
            self.n_bins += this_n_bin
            # initiate this_bin_dict for this ref seq
            this_bin_dict = {}
            # read each bin id and its chunks
            for j in range( this_n_bin ):
                ( this_bin, this_n_chunk ) = unpack( '<II', self.fhd.read( 8 ) )
                # increment the total n_chunks counter
                self.n_chunks += this_n_chunk
                this_chunks = []
                for m in range( this_n_chunk ):
                    this_chunks.append( unpack( '<qq', self.fhd.read( 16 ) ) )
                # put the list of chunks in the this_bin_dict
                this_bin_dict[ this_bin ] = this_chunks
            # put the this_bin_dict in the list
            self.bins[ i ] = this_bin_dict
            # we will skip the next linear index part -- since I don't
            # think we need them in regular
            # ChIP-seq/ATAC-seq/other-seq analysis. Let me know if I am wrong!
            this_n_intv = unpack( '<I', self.fhd.read( 4 ) )[ 0 ]
            self.fhd.seek( 8 * this_n_intv, 1 )
        # we will also skip n_no_coor, the Number of unplaced unmapped reads
        # now extract and clean up the pseudo-bins
        # if empty, just skip
        for i in range( self.n_ref ):
            if not self.bins[ i ]: continue
            pseudobin = self.bins[ i ].pop( 37450 )
            self.metadata[ i ] = {"ref_beg": pseudobin[0][0], "ref_end": pseudobin[0][1],
                                  "n_mapped": pseudobin[1][0], "n_unmapped": pseudobin[1][1] }
            self.n_mapped += pseudobin[1][0]
            self.n_unmapped += pseudobin[1][1]
        return
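    # Layout walked by __load_bins() above (per the BAI section of the SAM/BAM
    # spec cited in this module; listed only as an orientation aid):
    #
    #   magic "BAI\1", n_ref,
    #   then for each reference:
    #       n_bin,
    #       for each bin: bin id (uint32), n_chunk (uint32),
    #           n_chunk pairs of virtual offsets (chunk_beg, chunk_end),
    #       n_intv and the 64-bit linear index entries (skipped here),
    #   finally the optional n_no_coor count (also skipped).
    #
    # Bin 37450 is the pseudo-bin holding ref_beg/ref_end plus the mapped and
    # unmapped read counts, which __load_bins() moves into self.metadata.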
    cpdef list get_chunks_by_bin ( self, uint32_t ref_n, uint32_t bin_n ):
        """Get the chunks by bin number, for a given ref seq (not the name,
        but the index of the reference).

        The chunks will be sorted using the default python sorted
        function. Therefore the result will be in the order of the
        offset of the beginning of each chunk.
        """
        return sorted( self.bins[ ref_n ].get( bin_n, [] ) )
    cpdef list get_chunks_by_list_of_bins ( self, uint32_t ref_n, list bins ):
        """Similar to get_chunks_by_bin, but accepts a list of bins.

        Note: redundant bins in the list will be removed.
        """
        cdef:
            uint32_t bin_n
            list chunks = []
            set bin_set
        bin_set = set( bins )
        for bin_n in bin_set:
            chunks.extend( self.bins[ ref_n ].get( bin_n, [] ) )
        return sorted( chunks )
    cpdef dict get_metadata_by_refseq ( self, uint32_t ref_n ):
        return self.metadata[ ref_n ]
    cpdef list get_chunks_by_region ( self, uint32_t ref_n, uint32_t beg, uint32_t end ):
        """Get the chunks by given a region in a given refseq (not the name,
        but the index of the reference).
        """
        cdef:
            list bins
            list chunks
        bins = get_bins_by_region( beg, end )
        chunks = self.get_chunks_by_list_of_bins( ref_n, bins )
        return chunks
    cpdef list get_chunks_by_list_of_regions ( self, uint32_t ref_n, list regions ):
        """ Similar to get_chunks_by_region, but accepts a list of regions
        """
        cdef:
            int i
            uint32_t beg, end
            list temp_bins
            list bins = []
        for i in range( len(regions) ):
            beg = regions[ i ][ 0 ]
            end = regions[ i ][ 1 ]
            temp_bins = get_bins_by_region( beg, end )
            bins.extend( temp_bins )
        return self.get_chunks_by_list_of_bins( ref_n, bins )
    cpdef uint64_t get_coffset_by_region ( self, uint32_t ref_n, uint32_t beg, uint32_t end ):
        """Find the offset to access the BGZF block which contains the
        leftmost reads overlapping with the given genomic region.
        """
        cdef:
            uint64_t voffset_tmp, coffset_tmp
            uint64_t coffset
            int i
            list chunks
        chunks = self.get_chunks_by_region( ref_n, beg, end )
        if not chunks:
            return 0
        coffset = chunks[0][0] >> 16
        for i in range( 1, len(chunks) ):
            voffset_tmp = chunks[i][0]
            coffset_tmp = voffset_tmp >> 16
            if coffset_tmp < coffset:
                coffset = coffset_tmp
        return coffset
    cpdef list get_coffsets_by_list_of_regions ( self, uint32_t ref_n, list regions ):
        """Find the offsets to access the BGZF blocks which contain the
        leftmost reads overlapping with the given genomic regions.
        """
        cdef:
            uint32_t beg, end
            int i
            uint64_t coffset
            list coffset_list = []
        for i in range( len(regions) ):
            beg = regions[ i ][ 0 ]
            end = regions[ i ][ 1 ]
            coffset = self.get_coffset_by_region( ref_n, beg, end )
            coffset_list.append( coffset )
        return coffset_list
cdef class BAMaccessor:
    """This BAM file reader is designed to access certain alignment
    records that overlap with given genomic coordinates.

    The BAMaccessor needs a matching BAM index file (.bai) for fast
    access. Please refer to the SAM format specification:

    https://samtools.github.io/hts-specs/SAMv1.pdf

    Note, the header of BAM will still be read in the 'traditional'
    way -- using gzip to read through. But for accessing specific
    alignments, we will read BAM as binary, access the compressed
    chunk, then use zlib.decompress.
    """
    cdef:
        str bam_filename        # BAM filename
        str bai_filename        # BAI filename
        object bamfile          # BAM file handler in "rb" mode
        BAIFile baifile         # matching BAI file
        list references         # names of references/chromosomes, keeps the order of chromosomes
        dict rlengths           # lengths of references/chromosomes
        bytes bgzf_block_cache  # cache of decompressed bgzf_block
        uint64_t coffset_cache  # coffset of the cached bgzf_block
        uint64_t noffset_cache  # coffset of the next block after the cached bgzf_block
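    # Illustrative usage sketch (not part of the original module; the filename
    # and coordinates below are hypothetical):
    #
    #   accessor = BAMaccessor( "sample_sorted.bam" )   # needs sample_sorted.bam.bai
    #   chroms   = accessor.get_chromosomes()
    #   reads    = accessor.get_reads_in_region( b"chr1", 100000, 100500, maxDuplicate=1 )
    #
    # get_reads_in_region() returns a list of ReadAlignment objects overlapping
    # the region, keeping at most maxDuplicate copies of identical alignments.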
    def __init__ ( self, str BAM_filename ):
        """Open input file. Determine whether it's a gzipped file.

        'filename' must be a string object.

        It will call __parse_header to check if the file is in BAM format
        (through the magic string), and check if it's sorted by coordinates
        (SO). It will then check if BAI is available.
        """
        self.bam_filename = BAM_filename
        # parse the header and check if the BAM is sorted
        self.__parse_header()
        # check if BAI is available
        self.bai_filename = self.bam_filename + ".bai"
        if os.path.exists( self.bai_filename ):
            self.baifile = BAIFile( self.bai_filename ) # The baifile is not for IO, it already contains all content.
        else:
            raise Exception("BAI is not available! Please make sure the .bai file exists.")
        self.bamfile = io.BufferedReader( io.open( self.bam_filename, mode='rb' ), buffer_size = READ_BUFFER_SIZE ) # binary mode to read the raw BGZF file
        self.bgzf_block_cache = b""
        self.coffset_cache = 0
        self.noffset_cache = 0
    cpdef close ( self ):
        """Run this when this Parser will never be used again.

        Close the file I/O stream.
        """
        self.bamfile.close()
    cdef __parse_header( self ):
        """Parse the HEADER of BAM. It will do the following things:

        1. read in references from BAM header, and save in
        self.references, and self.rlengths.

        2. invoke self.check_sorted to see if BAM is sorted by
        coordinates, and set self.sorted.

        3. remember the file pointer at the end of HEADER or the start
        of the alignment blocks, and set self.SOA.
        """
        cdef:
            object fhd              # file handler from gzip.open; temporary use
            int header_len, x, nc, nlength
            bytes magic, header_text, refname
            list references = []
            dict rlengths = {}
        # we use the traditional way to read the header -- through gzip
        fhd = io.BufferedReader( gzip.open( self.bam_filename, mode='rb' ), buffer_size = READ_BUFFER_SIZE )
        # get the first 4 bytes
        magic = fhd.read( 4 )
        if magic != b"BAM\1":
            raise Exception( f"File \"{self.bam_filename}\" is not a BAM file. The magic string is not \"BAM\\1\", but \"{magic}\" " )
        # the next 4 bytes is the length of the header
        header_len = unpack( '<i', fhd.read( 4 ) )[ 0 ]
        header_text = unpack( '<%ds' % header_len, fhd.read( header_len ) )[0]
        # get the number of chromosomes
        nc = unpack( '<i', fhd.read( 4 ) )[ 0 ]
        for x in range( nc ):
            # read each chromosome name
            nlength = unpack( '<i', fhd.read( 4 ) )[ 0 ]
            refname = fhd.read( nlength )[ :-1 ]
            references.append( refname )
            # don't jump over chromosome size
            # we can use it to avoid falling off chrom ends during peak calling
            rlengths[ refname ] = unpack( '<i', fhd.read( 4 ) )[ 0 ]
        self.references = references
        self.rlengths = rlengths
        # read sorting information
        if not self.__check_sorted( header_text ):
            raise Exception( "BAM should be sorted by coordinates!" )
        # close the temporary gzip handler
        fhd.close()
        return
    cdef bool __check_sorted( self, bytes header ):
        """Check if the BAM is sorted by looking at the HEADER text
        """
        cdef:
            bytes l
            int i
        # HD line is the first line
        l = header.split( b"\n" )[0]
        if l.startswith( b"@HD" ):
            # locate the SO: tag
            i = l.find( b"SO:" )
            if i == -1:
                return False
            # I will only check the first 5 characters
            if l[ i+3 : i+8 ] == b"coord":
                return True
        return False
    cpdef list get_chromosomes( self ):
        """Get chromosomes in order of their appearance in BAM HEADER.
        """
        return self.references
    cpdef dict get_rlengths( self ):
        """Get lengths of chromosomes in order of their appearance in BAM HEADER.
        """
        return self.rlengths
    cdef tuple __decode_voffset( self, uint64_t voffset ):
        """Decode a BGZF virtual offset into the compressed file offset
        (coffset) and the offset within the uncompressed block (uoffset).
        """
        cdef:
            uint64_t coffset, uoffset
        coffset = voffset >> 16
        uoffset = voffset ^ (coffset << 16)
        return ( coffset, uoffset )
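    # Illustrative note (not part of the original module): a BGZF virtual
    # offset packs the compressed block start in its upper 48 bits and the
    # position inside the decompressed block in its lower 16 bits, e.g.
    #
    #   voffset = (1234 << 16) | 56          # -> 80871480
    #   coffset = voffset >> 16              # -> 1234
    #   uoffset = voffset ^ (coffset << 16)  # -> 56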
    cdef bool __seek( self, uint64_t offset ):
        """Move to the given position
        """
        self.bamfile.seek( offset, 0 )
        return True
    cdef bool __retrieve_cdata_from_bgzf_block ( self ):
        """Retrieve the uncompressed CDATA from the current BGZF block
        """
        cdef:
            uint16_t xlen, bsize
            bytes extra, cdata
        self.bamfile.seek( 10, 1 )  # skip the 10 fixed bytes of the gzip header
        # get the length of the extra subfields (XLEN)
        xlen = unpack( "<H", self.bamfile.read(2) )[0]
        # get extra subfields
        extra = self.bamfile.read( xlen )
        bsize = unpack( '<H', extra[ extra.index(b'BC\x02\x00') + 4 : ] )[0]
        cdata = self.bamfile.read( bsize - xlen - 19 )
        self.bgzf_block_cache = decompress( cdata, -MAX_WBITS )
        self.bamfile.seek( 8, 1 )   # move to the end of the block, skipping CRC32 and ISIZE
        return True
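    # Byte layout consumed by __retrieve_cdata_from_bgzf_block() above (from
    # the BGZF description in the SAM/BAM spec; listed only as an orientation
    # aid):
    #
    #   10 bytes   fixed gzip header
    #    2 bytes   XLEN (length of the extra subfields)
    #   XLEN bytes extra subfields, including 'BC' whose value BSIZE is the
    #              total block size minus 1
    #   CDATA      deflate-compressed payload of BSIZE - XLEN - 19 bytes
    #    8 bytes   CRC32 and ISIZE trailer
    #
    # decompress(cdata, -MAX_WBITS) inflates the raw deflate stream without
    # expecting a gzip/zlib wrapper.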
    cpdef list get_reads_in_region ( self, bytes chrom, int left, int right, int maxDuplicate = 1 ):
        """Get reads in a given region.

        Return: list of ReadAlignment
        """
        cdef:
            int entrylength, fpos, strand, chrid
            int i_bytes, dcdata_length, chrom_index
            int cur_duplicates = 0
            list readslist = []
            object read, previous_read
            bytes dcdata
            uint64_t coffset
            bool flag_end_searching = False

        previous_read = None
        chrom_index = self.references.index( chrom )

        # get the coffset from BAI
        coffset = self.baifile.get_coffset_by_region( chrom_index, left, right )
        # if the wanted block is not the cached one, seek to it and decompress it
        if coffset != self.coffset_cache:
            self.coffset_cache = coffset
            self.__seek( coffset )
            self.__retrieve_cdata_from_bgzf_block( ) # note: self.cached_bgzf_block will be updated.
        dcdata = self.bgzf_block_cache

        while True:
            dcdata_length = len( dcdata )
            # now parse the reads in dcdata
            i_bytes = 0
            while i_bytes < dcdata_length:
                entrylength = unpack( '<I', dcdata[ i_bytes : i_bytes + 4 ] )[ 0 ]
                i_bytes += 4
                # I am not sure if i_bytes+entrylength can be outside of dcdata_length...
                read = self.__fw_binary_parse( dcdata[ i_bytes : i_bytes + entrylength ] )
                i_bytes += entrylength
                if read == None:
                    # no mapping information, skip to next one
                    continue
                if read["lpos"] > right:
                    # outside of region, end searching
                    flag_end_searching = True
                    break
                elif read["rpos"] > left:
                    # this read overlaps with the region; count identical alignments
                    if previous_read != None and previous_read["lpos"] == read["lpos"] and previous_read["rpos"] == read["rpos"] and previous_read["strand"] == read["strand"] and previous_read["cigar"] == read["cigar"]:
                        cur_duplicates += 1
                    else:
                        cur_duplicates = 1
                    if cur_duplicates <= maxDuplicate:
                        readslist.append( read )
                    previous_read = read
            if flag_end_searching:
                break
            # if not, get the next block, search again
            self.coffset_cache = self.bamfile.tell()
            self.__retrieve_cdata_from_bgzf_block()
            dcdata = self.bgzf_block_cache
            if not dcdata:
                # empty means EOF is reached.
                break
        return readslist
    cdef object __fw_binary_parse (self, data, min_MAPQ=1 ):
        """ Read information from an alignment block. Discard
        alignment below a minimum MAPQ score, default: 1.

        Return: ReadAlignment object with only selected information,
        including refname (chromosome), leftmost alignment location,
        strand information, sequence, quality, CIGAR and MD tag.

        Note: We do require that the MD tag exists in the BAM file.
        """
        cdef:
            int ref, leftmost, rightmost, i, j, l_seq, strand
            short bwflag, l_read_name, n_cigar_op, MAPQ
            short bin_bam
            bytes read_name
            bytes seq       #note: for each byte, 1st base in the highest 4bit; 2nd in the lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15]
            bytes qual
            tuple cigar_op  # op_len<<4|op, op: "MIDNSHP=X" -> 012345678
            bytes tag
            bytes MD

        if not data: return None

        # read number of CIGAR operators, Bitwise FLAG
        (n_cigar_op, bwflag ) = unpack( '<HH' , data[ 12:16 ] )

        # first, we will discard problematic reads
        if bwflag & 4 or bwflag & 512 or bwflag & 256 or bwflag & 2048:
            return None       #unmapped sequence or bad sequence or secondary or supplementary alignment
        if bwflag & 1:
            # paired read. We should only keep sequence if the mate is mapped
            # Different from other MACS subcommands, both reads will be kept.
            if not bwflag & 2:
                return None   # not a proper pair
            if bwflag & 8:
                return None   # the mate is unmapped

        # read length of readname, MAPQ, and bin information
        (l_read_name, MAPQ, bin_bam) = unpack( '<BBH', data[ 8:12 ] )

        # we will also discard reads having MAPQ lower than min_MAPQ. MAPQ=255 means MAPQ is not available
        if MAPQ < min_MAPQ or MAPQ == 255:
            return None

        # Now we read other information

        # read the index of the reference sequence
        ref = unpack( '<i', data[ 0:4 ] )[ 0 ]
        # read leftmost position of alignment
        leftmost = unpack( '<i', data[ 4:8 ] )[ 0 ]
        # read length of query sequence
        l_seq = unpack( '<i', data[ 16:20 ] )[0]
        # readname; skip next_refID, next_pos, tlen, which is 12 bytes, i.e. read from the 32nd byte
        read_name = unpack( '<%ds' % (l_read_name), data[ 32: 32+l_read_name ] )[0][:-1] # last byte is \x00
        # cigar_op, find the index i first.
        i = 32 + l_read_name
        cigar_op = unpack( '<%dI' % (n_cigar_op) , data[ i : i + n_cigar_op*4 ] )
        # read sequence information
        i += n_cigar_op*4
        seq = unpack( '<%ds' % int((l_seq+1)/2), data[ i: i + int((l_seq+1)/2)] )[0]
        # read quality information. Note: the value in BAM is the actual Phred score, there is no +33!
        i += int((l_seq+1)/2)
        qual = unpack( '<%ds' % (l_seq), data[ i : i + l_seq])[0]

        # compute the rightmost position from the CIGAR operations
        rightmost = leftmost
        for j in cigar_op:
            if j & 15 in [ 0, 2, 3, 7, 8 ]: # they are CIGAR op M/D/N/=/X, no need to add I, S, H, or P
                rightmost += j >> 4

        # strand information from the bitwise FLAG
        if bwflag & 16:
            # reverse strand
            strand = 1
        else:
            strand = 0

        # locate the MD tag in the auxiliary data
        i += l_seq
        tag = data[ i : ]
        j = tag.find( b'MDZ' )
        if j == -1: raise MDTagMissingError( data[ 32: 32+l_read_name ], tag )
        MD = tag[ j+3 : tag[j:].find(b"\0") + j ]

        # construct a ReadAlignment object and return
        return ReadAlignment( read_name, self.references[ref], leftmost, rightmost, strand, seq, qual, cigar_op, MD )
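    # Illustrative note (not part of the original module): `seq` above is kept
    # in the packed BAM 4-bit encoding, two bases per byte with the first base
    # in the high nibble. A sketch of how it could be decoded:
    #
    #   bases = "=ACMGRSVTWYHKDBN"
    #   decoded = ""
    #   for k in range( l_seq ):
    #       b = seq[ k >> 1 ]
    #       decoded += bases[ (b >> 4) if k % 2 == 0 else (b & 15) ]
    #
    # Whether ReadAlignment expects the packed or the decoded form is defined
    # in MACS3.Signal.ReadAlignment, not here.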