# cython: language_level=3
# cython: linetrace=True
# Time-stamp: <2021-03-10 23:24:04 Tao Liu>

"""
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file LICENSE included with
the distribution).
"""
# ------------------------------------
# python modules
# ------------------------------------
import io
import os
import sys
import gzip
from logging import info, debug
from struct import unpack
from zlib import decompress, MAX_WBITS
# ------------------------------------
# MACS3 modules
# ------------------------------------
from MACS3.Utilities.Constants import *
from MACS3.Signal.ReadAlignment import ReadAlignment
# ------------------------------------
# Other modules
# ------------------------------------
from cpython cimport bool
from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t

is_le = sys.byteorder == "little"
cdef extern from "stdlib.h":
    ctypedef unsigned int size_t
    size_t strlen(char *s)
    void *malloc(size_t size)
    void *calloc(size_t n, size_t size)
    int strcmp(char *a, char *b)
    char * strcpy(char *a, char *b)
    long atol(char *bytes)
# ------------------------------------
# constants
# ------------------------------------
__version__ = "BAM $Revision$"
__author__ = "Tao Liu <vladimir.liu@gmail.com>"
__doc__ = "SAPPER BAMParser class"
# ------------------------------------
# Misc functions
# ------------------------------------
cdef list get_bins_by_region ( uint32_t beg, uint32_t end ):
    """ Get the possible bins by given a region
    """
    cdef:
        list bins = [ 0 ]       # bin 0 is always included
        uint32_t k
    # for different levels
    for k in range(1 + (beg>>26), 2 + (end>>26) ):
        bins.append( k )
    for k in range(9 + (beg>>23), 10 + (end>>23) ):
        bins.append( k )
    for k in range(73 + (beg>>20), 74 + (end>>20) ):
        bins.append( k )
    for k in range(585 + (beg>>17), 586 + (end>>17) ):
        bins.append( k )
    for k in range(4681 + (beg>>14), 4682 + (end>>14) ):
        bins.append( k )
    return bins
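# Illustrative note (not part of the original module): for a small region such
# as [0, 1000), each loop above contributes a single bin, so the call returns
# one bin per indexing level:
#
#   get_bins_by_region(0, 1000)  ->  [0, 1, 9, 73, 585, 4681]
#
# Wider regions add a contiguous run of bin ids at the finer levels.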
cdef list reg2bins( rbeg, rend ):
    """Generates bin ids which overlap the specified region.

    Args:
        rbeg (int): inclusive beginning position of region
        rend (int): exclusive end position of region
    Returns:
        (list): bin IDs for overlapping bins of region
    Raises:
        AssertionError (Exception): if the range is malformed or invalid
    """
    # Based off the algorithm presented in:
    # https://samtools.github.io/hts-specs/SAMv1.pdf
    cdef list l = []
    # Bin calculation constants.
    BIN_ID_STARTS = (0, 1, 9, 73, 585, 4681)
    # Maximum range supported by specifications.
    MAX_RNG = (2 ** 29) - 1
    assert 0 <= rbeg <= rend <= MAX_RNG, 'Invalid region {}, {}'.format( rbeg, rend )
    for start, shift in zip( BIN_ID_STARTS, range(29, 13, -3) ):
        i = rbeg >> shift if rbeg > 0 else 0
        j = rend >> shift if rend < MAX_RNG else MAX_RNG >> shift
        for bin_id_offset in range( i, j + 1 ):
            #yield start + bin_id_offset
            l.append( start + bin_id_offset )
    return l
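# Illustrative note (not part of the original module): reg2bins() walks the
# same six binning levels as get_bins_by_region() above (shifts of 29, 26, 23,
# 20, 17 and 14 bits), so for a typical in-range region both helpers report
# the same bins; e.g. reg2bins(0, 1000) also gives [0, 1, 9, 73, 585, 4681].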
# ------------------------------------
# Classes
# ------------------------------------
class StrandFormatError( Exception ):
    """Exception about strand format error.

    Example:
    raise StrandFormatError('Must be F or R','X')
    """
    def __init__ ( self, string, strand ):
        self.string = string
        self.strand = strand

    def __str__ ( self ):
        return repr( "Strand information can not be recognized in this line: \"%s\",\"%s\"" % ( self.string, self.strand ) )
class MDTagMissingError( Exception ):
    """Exception about missing MD tag

    Example:
    raise MDTagMissingError( name, aux )

    aux is the auxiliary data part.
    """
    def __init__ ( self, name, aux ):
        self.name = name
        self.aux = aux

    def __str__ ( self ):
        return repr( "MD tag is missing! Please use \"samtools calmd\" command to add MD tags!\nName of sequence:%s\nCurrent auxiliary data section: %s" % ( self.name.decode(), self.aux.decode() ) )
148 """BAI File Class for BAI (index of BAM) File.
150 While initiating the object, BAI file will be loaded and the
151 information of bins and chunks will be saved in the class object.
153 Please refer to https://samtools.github.io/hts-specs/SAMv1.pdf for
154 detail definition of BAI file.
157 str filename
# filename
158 object fhd
# file handler
159 bytes magic
# magic code for this file
160 uint32_t n_ref
# number of refseqs/chromosomes
161 dict metadata
# metadata for ref seqs
162 uint64_t n_bins
# total number of bins
163 uint64_t n_chunks
# total number of chunks
164 uint64_t n_mapped
# total mapped reads
165 uint64_t n_unmapped
# total unmapped reads
    def __init__ ( self, str filename ):
        """Open input file. Determine whether it's a gzipped file.

        'filename' must be a string object.

        This function initializes the following attributes:

        1. self.filename: the filename for input file.
        2. self.gzipped: a boolean indicating whether input file is gzipped.
        3. self.fhd: buffered I/O stream of input file
        """
        self.filename = filename
        self.fhd = io.open( filename, mode='rb' ) # binary mode! I don't expect unicode here!
        self.magic = self.fhd.read( 4 )
        if self.magic != b"BAI\1":
            raise Exception(f"Not a BAI file. The first 4 bytes are \'{self.magic}\'")
        self.n_ref = self.__read_n_ref()
        self.bins = list( range( self.n_ref ) )
        self.metadata = dict.fromkeys( self.bins )
        # load all bins and chunks from the BAI file, then close the file handler
        self.__load_bins()
        self.fhd.close()
    def __cinit__ ( self ):
        self.n_bins = 0
        self.n_chunks = 0
        self.n_mapped = 0
        self.n_unmapped = 0
    def __str__ ( self ):
        tmp = list( self.bins[0].keys() )[:10]
        return f"""Summary of BAI file:
filename: {self.filename}
number of reference sequences/chromosomes: {self.n_ref}
number of total bins: {self.n_bins}
number of total chunks: {self.n_chunks}
number of total mapped reads: {self.n_mapped}
number of total unmapped reads: {self.n_unmapped}
Example of bins: ref 0, {tmp} ..
Example of metadata: ref 0, {self.metadata[ 0 ]}
"""
    cdef uint32_t __read_n_ref ( self ):
        cdef:
            uint32_t ret_val
        self.fhd.seek( 4 ) # skip magic
        ret_val = unpack( '<I', self.fhd.read( 4 ) )[ 0 ]
        return ret_val
    cdef __load_bins ( self ):
        cdef:
            uint32_t i, j, m
            uint32_t this_n_bin, this_n_intv
            uint32_t this_bin
            uint32_t this_n_chunk
            dict this_bin_dict
            list this_chunks
            list pseudobin
        # skip the magic and n_ref
        self.fhd.seek( 8 )
        # for each ref seq/chromosome
        for i in range( self.n_ref ):
            # get the number of bins
            this_n_bin = unpack( '<I', self.fhd.read( 4 ) )[ 0 ]
            # must be less than 37451
            assert this_n_bin <= 37451
            # increment the total n_bins counter
            self.n_bins += this_n_bin
            # initiate this_bin_dict for this ref seq
            this_bin_dict = {}
            # read each bin id and its chunks
            for j in range( this_n_bin ):
                ( this_bin, this_n_chunk ) = unpack( '<II', self.fhd.read( 8 ) )
                # increment the total n_chunks counter
                self.n_chunks += this_n_chunk
                this_chunks = []
                for m in range( this_n_chunk ):
                    this_chunks.append( unpack( '<qq', self.fhd.read( 16 ) ) )
                # put the list of chunks in the this_bin_dict
                this_bin_dict[ this_bin ] = this_chunks
            # put the this_bin_dict in the list
            self.bins[ i ] = this_bin_dict
            # we will skip the next linear index part -- since I don't
            # think we need them in regular
            # ChIP-seq/ATAC-seq/other-seq analysis. Let me know if I am wrong!
            this_n_intv = unpack( '<I', self.fhd.read( 4 ) )[ 0 ]
            self.fhd.seek( 8 * this_n_intv, 1 )
        # we will also skip n_no_coor, the Number of unplaced unmapped reads
        # now extract and clean up the pseudo-bins
        # if empty, just skip
        for i in range( self.n_ref ):
            if not self.bins[ i ]: continue
            pseudobin = self.bins[ i ].pop( 37450 )
            self.metadata[ i ] = {"ref_beg": pseudobin[0][0], "ref_end": pseudobin[0][1],
                                  "n_mapped": pseudobin[1][0], "n_unmapped": pseudobin[1][1] }
            self.n_mapped += pseudobin[1][0]
            self.n_unmapped += pseudobin[1][1]
        return
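    # Layout walked by __load_bins() above (per the BAI section of the SAM/BAM
    # spec cited in this module; listed only as an orientation aid):
    #
    #   magic "BAI\1", n_ref,
    #   then for each reference:
    #       n_bin,
    #       for each bin: bin id (uint32), n_chunk (uint32),
    #           n_chunk pairs of virtual offsets (chunk_beg, chunk_end),
    #       n_intv and the 64-bit linear index entries (skipped here),
    #   finally the optional n_no_coor count (also skipped).
    #
    # Bin 37450 is the pseudo-bin holding ref_beg/ref_end plus the mapped and
    # unmapped read counts, which __load_bins() moves into self.metadata.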
    cpdef list get_chunks_by_bin ( self, uint32_t ref_n, uint32_t bin_n ):
        """Get the chunks by bin number, for a given ref seq (not the name,
        but the index of the reference).

        The chunks will be sorted using the default python sorted
        function. Therefore the result will be in the order of the
        offset of the beginning of each chunk.
        """
        return sorted( self.bins[ ref_n ].get( bin_n, [] ) )
    cpdef list get_chunks_by_list_of_bins ( self, uint32_t ref_n, list bins ):
        """Similar to get_chunks_by_bin, but accepts a list of bins.

        Note: redundant bins in the list will be removed.
        """
        cdef:
            uint32_t bin_n
            list chunks = []
            set bin_set
        bin_set = set( bins )
        for bin_n in bin_set:
            chunks.extend( self.bins[ ref_n ].get( bin_n, [] ) )
        return sorted( chunks )
    cpdef dict get_metadata_by_refseq ( self, uint32_t ref_n ):
        return self.metadata[ ref_n ]
    cpdef list get_chunks_by_region ( self, uint32_t ref_n, uint32_t beg, uint32_t end ):
        """Get the chunks by given a region in a given refseq (not the name,
        but the index of the reference).
        """
        cdef:
            list bins
            list chunks
        bins = get_bins_by_region( beg, end )
        chunks = self.get_chunks_by_list_of_bins( ref_n, bins )
        return chunks
    cpdef list get_chunks_by_list_of_regions ( self, uint32_t ref_n, list regions ):
        """ Similar to get_chunks_by_region, but accepts a list of regions
        """
        cdef:
            int i
            uint32_t beg, end
            list temp_bins
            list bins = []
        for i in range( len(regions) ):
            beg = regions[ i ][ 0 ]
            end = regions[ i ][ 1 ]
            temp_bins = get_bins_by_region( beg, end )
            bins.extend( temp_bins )
        return self.get_chunks_by_list_of_bins( ref_n, bins )
    cpdef uint64_t get_coffset_by_region ( self, uint32_t ref_n, uint32_t beg, uint32_t end ):
        """Find the offset to access the BGZF block which contains the
        leftmost reads overlapping with the given genomic region.
        """
        cdef:
            uint64_t voffset_tmp, coffset_tmp
            uint64_t coffset
            int i
            list chunks
        chunks = self.get_chunks_by_region( ref_n, beg, end )
        if not chunks:
            return 0
        coffset = chunks[0][0] >> 16
        for i in range( 1, len(chunks) ):
            voffset_tmp = chunks[i][0]
            coffset_tmp = voffset_tmp >> 16
            if coffset_tmp < coffset:
                coffset = coffset_tmp
        return coffset
    cpdef list get_coffsets_by_list_of_regions ( self, uint32_t ref_n, list regions ):
        """Find the offsets to access the BGZF blocks which contain the
        leftmost reads overlapping with the given genomic regions.
        """
        cdef:
            uint32_t beg, end
            int i
            uint64_t coffset
            list coffset_list = []
        for i in range( len(regions) ):
            beg = regions[ i ][ 0 ]
            end = regions[ i ][ 1 ]
            coffset = self.get_coffset_by_region( ref_n, beg, end )
            coffset_list.append( coffset )
        return coffset_list
cdef class BAMaccessor:
    """This BAM file reader is designed to access certain alignment
    records that overlap with given genomic coordinates.

    The BAMaccessor needs a matching BAM index file (.bai) for fast
    access. Please refer to the SAM format specification:

    https://samtools.github.io/hts-specs/SAMv1.pdf

    Note, the header of BAM will still be read in the 'traditional'
    way -- using gzip to read through. But for accessing specific
    alignments, we will read BAM as binary, access the compressed
    chunk, then use zlib.decompress.
    """
    cdef:
        str bam_filename        # BAM filename
        str bai_filename        # BAI filename
        object bamfile          # BAM file handler in "rb" mode
        BAIFile baifile         # matching BAI file
        list references         # names of references/chromosomes, keeps the order of chromosomes
        dict rlengths           # lengths of references/chromosomes
        bytes bgzf_block_cache  # cache of decompressed bgzf_block
        uint64_t coffset_cache  # coffset of the cached bgzf_block
        uint64_t noffset_cache  # coffset of the next block after the cached bgzf_block
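    # Illustrative usage sketch (not part of the original module; the filename
    # and coordinates below are hypothetical):
    #
    #   accessor = BAMaccessor( "sample_sorted.bam" )   # needs sample_sorted.bam.bai
    #   chroms   = accessor.get_chromosomes()
    #   reads    = accessor.get_reads_in_region( b"chr1", 100000, 100500, maxDuplicate=1 )
    #
    # get_reads_in_region() returns a list of ReadAlignment objects overlapping
    # the region, keeping at most maxDuplicate copies of identical alignments.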
    def __init__ ( self, str BAM_filename ):
        """Open input file. Determine whether it's a gzipped file.

        'filename' must be a string object.

        It will call __parse_header to check if the file is in BAM format
        (through the magic string), and check if it's sorted by coordinates
        (SO). It will then check if BAI is available.
        """
        self.bam_filename = BAM_filename
        # parse the header and check if the BAM is sorted
        self.__parse_header()
        # check if BAI is available
        self.bai_filename = self.bam_filename + ".bai"
        if os.path.exists( self.bai_filename ):
            self.baifile = BAIFile( self.bai_filename ) # The baifile is not for IO, it already contains all content.
        else:
            raise Exception("BAI is not available! Please make sure the .bai file exists.")
        self.bamfile = io.BufferedReader( io.open( self.bam_filename, mode='rb' ), buffer_size = READ_BUFFER_SIZE ) # binary mode to read the raw BGZF file
        self.bgzf_block_cache = b""
        self.coffset_cache = 0
        self.noffset_cache = 0
    cpdef close ( self ):
        """Run this when this Parser will never be used again.

        Close the file I/O stream.
        """
        self.bamfile.close()
    cdef __parse_header( self ):
        """Parse the HEADER of BAM. It will do the following things:

        1. read in references from BAM header, and save in
        self.references, and self.rlengths.

        2. invoke self.check_sorted to see if BAM is sorted by
        coordinates, and set self.sorted.

        3. remember the file pointer at the end of HEADER or the start
        of the alignment blocks, and set self.SOA.
        """
        cdef:
            object fhd              # file handler from gzip.open; temporary use
            int header_len, x, nc, nlength
            bytes magic, header_text, refname
            list references = []
            dict rlengths = {}
        # we use the traditional way to read the header -- through gzip
        fhd = io.BufferedReader( gzip.open( self.bam_filename, mode='rb' ), buffer_size = READ_BUFFER_SIZE )
        # get the first 4 bytes
        magic = fhd.read( 4 )
        if magic != b"BAM\1":
            raise Exception( f"File \"{self.bam_filename}\" is not a BAM file. The magic string is not \"BAM\\1\", but \"{magic}\" " )
        # the next 4 bytes is the length of the header
        header_len = unpack( '<i', fhd.read( 4 ) )[ 0 ]
        header_text = unpack( '<%ds' % header_len, fhd.read( header_len ) )[0]
        # get the number of chromosomes
        nc = unpack( '<i', fhd.read( 4 ) )[ 0 ]
        for x in range( nc ):
            # read each chromosome name
            nlength = unpack( '<i', fhd.read( 4 ) )[ 0 ]
            refname = fhd.read( nlength )[ :-1 ]
            references.append( refname )
            # don't jump over chromosome size
            # we can use it to avoid falling off chrom ends during peak calling
            rlengths[ refname ] = unpack( '<i', fhd.read( 4 ) )[ 0 ]
        self.references = references
        self.rlengths = rlengths
        # read sorting information
        if not self.__check_sorted( header_text ):
            raise Exception( "BAM should be sorted by coordinates!" )
        # close the temporary gzip handler
        fhd.close()
        return
    cdef bool __check_sorted( self, bytes header ):
        """Check if the BAM is sorted by looking at the HEADER text
        """
        cdef:
            bytes l
            int i
        # HD line is the first line
        l = header.split( b"\n" )[0]
        if l.startswith( b"@HD" ):
            # locate the SO: tag
            i = l.find( b"SO:" )
            if i == -1:
                return False
            # I will only check the first 5 characters
            if l[ i+3 : i+8 ] == b"coord":
                return True
        return False
    cpdef list get_chromosomes( self ):
        """Get chromosomes in order of their appearance in BAM HEADER.
        """
        return self.references
    cpdef dict get_rlengths( self ):
        """Get lengths of chromosomes in order of their appearance in BAM HEADER.
        """
        return self.rlengths
    cdef tuple __decode_voffset( self, uint64_t voffset ):
        """Decode a BGZF virtual offset into the compressed file offset
        (coffset) and the offset within the uncompressed block (uoffset).
        """
        cdef:
            uint64_t coffset, uoffset
        coffset = voffset >> 16
        uoffset = voffset ^ (coffset << 16)
        return ( coffset, uoffset )
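    # Illustrative note (not part of the original module): a BGZF virtual
    # offset packs the compressed block start in its upper 48 bits and the
    # position inside the decompressed block in its lower 16 bits, e.g.
    #
    #   voffset = (1234 << 16) | 56          # -> 80871480
    #   coffset = voffset >> 16              # -> 1234
    #   uoffset = voffset ^ (coffset << 16)  # -> 56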
    cdef bool __seek( self, uint64_t offset ):
        """Move to the given position
        """
        self.bamfile.seek( offset, 0 )
        return True
    cdef bool __retrieve_cdata_from_bgzf_block ( self ):
        """Retrieve the uncompressed CDATA from the current BGZF block
        """
        cdef:
            uint16_t xlen, bsize
            bytes extra, cdata
        self.bamfile.seek( 10, 1 )  # skip the 10 fixed bytes of the gzip header
        # get the length of the extra subfields (XLEN)
        xlen = unpack( "<H", self.bamfile.read(2) )[0]
        # get extra subfields
        extra = self.bamfile.read( xlen )
        bsize = unpack( '<H', extra[ extra.index(b'BC\x02\x00') + 4 : ] )[0]
        cdata = self.bamfile.read( bsize - xlen - 19 )
        self.bgzf_block_cache = decompress( cdata, -MAX_WBITS )
        self.bamfile.seek( 8, 1 )   # move to the end of the block, skipping CRC32 and ISIZE
        return True
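    # Byte layout consumed by __retrieve_cdata_from_bgzf_block() above (from
    # the BGZF description in the SAM/BAM spec; listed only as an orientation
    # aid):
    #
    #   10 bytes   fixed gzip header
    #    2 bytes   XLEN (length of the extra subfields)
    #   XLEN bytes extra subfields, including 'BC' whose value BSIZE is the
    #              total block size minus 1
    #   CDATA      deflate-compressed payload of BSIZE - XLEN - 19 bytes
    #    8 bytes   CRC32 and ISIZE trailer
    #
    # decompress(cdata, -MAX_WBITS) inflates the raw deflate stream without
    # expecting a gzip/zlib wrapper.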
    cpdef list get_reads_in_region ( self, bytes chrom, int left, int right, int maxDuplicate = 1 ):
        """Get reads in a given region.

        Return: list of ReadAlignment
        """
        cdef:
            int entrylength, fpos, strand, chrid
            int i_bytes, dcdata_length, chrom_index
            int cur_duplicates = 0
            list readslist = []
            object read, previous_read
            bytes dcdata
            uint64_t coffset
            bool flag_end_searching = False

        previous_read = None
        chrom_index = self.references.index( chrom )

        # get the coffset from BAI
        coffset = self.baifile.get_coffset_by_region( chrom_index, left, right )
        # if the wanted block is not the cached one, seek to it and decompress it
        if coffset != self.coffset_cache:
            self.coffset_cache = coffset
            self.__seek( coffset )
            self.__retrieve_cdata_from_bgzf_block( ) # note: self.cached_bgzf_block will be updated.
        dcdata = self.bgzf_block_cache

        while True:
            dcdata_length = len( dcdata )
            # now parse the reads in dcdata
            i_bytes = 0
            while i_bytes < dcdata_length:
                entrylength = unpack( '<I', dcdata[ i_bytes : i_bytes + 4 ] )[ 0 ]
                i_bytes += 4
                # I am not sure if i_bytes+entrylength can be outside of dcdata_length...
                read = self.__fw_binary_parse( dcdata[ i_bytes : i_bytes + entrylength ] )
                i_bytes += entrylength
                if read == None:
                    # no mapping information, skip to next one
                    continue
                if read["lpos"] > right:
                    # outside of region, end searching
                    flag_end_searching = True
                    break
                elif read["rpos"] > left:
                    # this read overlaps with the region; count identical alignments
                    if previous_read != None and previous_read["lpos"] == read["lpos"] and previous_read["rpos"] == read["rpos"] and previous_read["strand"] == read["strand"] and previous_read["cigar"] == read["cigar"]:
                        cur_duplicates += 1
                    else:
                        cur_duplicates = 1
                    if cur_duplicates <= maxDuplicate:
                        readslist.append( read )
                    previous_read = read
            if flag_end_searching:
                break
            # if not, get the next block, search again
            self.coffset_cache = self.bamfile.tell()
            self.__retrieve_cdata_from_bgzf_block()
            dcdata = self.bgzf_block_cache
            if not dcdata:
                # empty means EOF is reached.
                break
        return readslist
    cdef object __fw_binary_parse (self, data, min_MAPQ=1 ):
        """ Read information from an alignment block. Discard
        alignment below a minimum MAPQ score, default: 1.

        Return: ReadAlignment object with only selected information,
        including refname (chromosome), leftmost alignment location,
        strand information, sequence, quality, CIGAR and MD tag.

        Note: We do require that the MD tag exists in the BAM file.
        """
        cdef:
            int ref, leftmost, rightmost, i, j, l_seq, strand
            short bwflag, l_read_name, n_cigar_op, MAPQ
            short bin_bam
            bytes read_name
            bytes seq       #note: for each byte, 1st base in the highest 4bit; 2nd in the lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15]
            bytes qual
            tuple cigar_op  # op_len<<4|op, op: "MIDNSHP=X" -> 012345678
            bytes tag
            bytes MD

        if not data: return None

        # read number of CIGAR operators, Bitwise FLAG
        (n_cigar_op, bwflag ) = unpack( '<HH' , data[ 12:16 ] )

        # first, we will discard problematic reads
        if bwflag & 4 or bwflag & 512 or bwflag & 256 or bwflag & 2048:
            return None       #unmapped sequence or bad sequence or secondary or supplementary alignment
        if bwflag & 1:
            # paired read. We should only keep sequence if the mate is mapped
            # Different from other MACS subcommands, both reads will be kept.
            if not bwflag & 2:
                return None   # not a proper pair
            if bwflag & 8:
                return None   # the mate is unmapped

        # read length of readname, MAPQ, and bin information
        (l_read_name, MAPQ, bin_bam) = unpack( '<BBH', data[ 8:12 ] )

        # we will also discard reads having MAPQ lower than min_MAPQ. MAPQ=255 means MAPQ is not available
        if MAPQ < min_MAPQ or MAPQ == 255:
            return None

        # Now we read other information

        # read the index of the reference sequence
        ref = unpack( '<i', data[ 0:4 ] )[ 0 ]
        # read leftmost position of alignment
        leftmost = unpack( '<i', data[ 4:8 ] )[ 0 ]
        # read length of query sequence
        l_seq = unpack( '<i', data[ 16:20 ] )[0]
        # readname; skip next_refID, next_pos, tlen, which is 12 bytes, i.e. read from the 32nd byte
        read_name = unpack( '<%ds' % (l_read_name), data[ 32: 32+l_read_name ] )[0][:-1] # last byte is \x00
        # cigar_op, find the index i first.
        i = 32 + l_read_name
        cigar_op = unpack( '<%dI' % (n_cigar_op) , data[ i : i + n_cigar_op*4 ] )
        # read sequence information
        i += n_cigar_op*4
        seq = unpack( '<%ds' % int((l_seq+1)/2), data[ i: i + int((l_seq+1)/2)] )[0]
        # read quality information. Note: the value in BAM is the actual Phred score, there is no +33!
        i += int((l_seq+1)/2)
        qual = unpack( '<%ds' % (l_seq), data[ i : i + l_seq])[0]

        # compute the rightmost position from the CIGAR operations
        rightmost = leftmost
        for j in cigar_op:
            if j & 15 in [ 0, 2, 3, 7, 8 ]: # they are CIGAR op M/D/N/=/X, no need to add I, S, H, or P
                rightmost += j >> 4

        # strand information from the bitwise FLAG
        if bwflag & 16:
            # reverse strand
            strand = 1
        else:
            strand = 0

        # locate the MD tag in the auxiliary data
        i += l_seq
        tag = data[ i : ]
        j = tag.find( b'MDZ' )
        if j == -1: raise MDTagMissingError( data[ 32: 32+l_read_name ], tag )
        MD = tag[ j+3 : tag[j:].find(b"\0") + j ]

        # construct a ReadAlignment object and return
        return ReadAlignment( read_name, self.references[ref], leftmost, rightmost, strand, seq, qual, cigar_op, MD )
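    # Illustrative note (not part of the original module): `seq` above is kept
    # in the packed BAM 4-bit encoding, two bases per byte with the first base
    # in the high nibble. A sketch of how it could be decoded:
    #
    #   bases = "=ACMGRSVTWYHKDBN"
    #   decoded = ""
    #   for k in range( l_seq ):
    #       b = seq[ k >> 1 ]
    #       decoded += bases[ (b >> 4) if k % 2 == 0 else (b & 15) ]
    #
    # Whether ReadAlignment expects the packed or the decoded form is defined
    # in MACS3.Signal.ReadAlignment, not here.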