1 # cython: language_level=3
3 # cython: linetrace=True
4 # Time-stamp: <2020-12-06 23:32:28 Tao Liu>
6 """Module for all MACS Parser classes for input.
8 This code is free software; you can redistribute it and/or modify it
9 under the terms of the BSD License (see the file LICENSE included with
13 # ------------------------------------
15 # ------------------------------------
17 from logging
import info
, debug
19 from struct import unpack
20 from re
import findall
25 # ------------------------------------
27 # ------------------------------------
29 from MACS3
.Utilities
.Constants
import *
30 from MACS3
.Signal
.FixWidthTrack
import FWTrack
31 from MACS3
.Signal
.PairedEndTrack
import PETrackI
33 # ------------------------------------
35 # ------------------------------------
36 from cpython cimport bool
40 from numpy cimport uint8_t
, uint16_t
, uint32_t
, uint64_t
, int8_t
, int16_t
, int32_t
, int64_t
, float32_t
, float64_t
42 is_le
= sys
.byteorder
== "little"
44 cdef extern from "stdlib.h":
45 ctypedef uint32_t size_t
46 size_t strlen
(char *s
)
47 void *malloc
(size_t size
)
48 void *calloc
(size_t n
, size_t size
)
50 int32_t strcmp
(char *a
, char *b
)
51 char * strcpy
(char *a
, char *b
)
52 int64_t atol
(char *str)
53 int32_t atoi
(char *str)
55 # ------------------------------------
57 # ------------------------------------
58 __version__
= "Parser $Revision$"
59 __author__
= "Tao Liu <taoliu@jimmy.harvard.edu>"
60 __doc__
= "All Parser classes"
62 # ------------------------------------
64 # ------------------------------------
66 cpdef guess_parser
( fname
, int64_t buffer_size
= 100000 ):
67 # Note: BAMPE and BEDPE can't be automatically detected.
68 ordered_parser_dict
= {"BAM":BAMParser
,
70 "ELAND":ELANDResultParser
,
71 "ELANDMULTI":ELANDMultiParser
,
72 "ELANDEXPORT":ELANDExportParser
,
74 "BOWTIE":BowtieParser
}
76 for f
in ordered_parser_dict
:
77 p
= ordered_parser_dict
[ f
]
78 t_parser
= p
( fname
, buffer_size
= buffer_size
)
79 debug
( "Testing format %s" % f
)
82 info
( "Detected format is: %s" % ( f
) )
83 if t_parser
.is_gzipped
():
84 info
( "* Input file is gzipped." )
88 raise Exception( "Can't detect format!" )
90 cdef tuple __bam_fw_binary_parse_le
( const unsigned
char * data
):
91 """Parse a BAM SE entry in little endian system
94 int32_t thisref
, thisstart
, thisstrand
104 # we skip lot of the available information in data (i.e. tag name, quality etc etc)
105 # no data, return, does it really happen without raising struct.error?
106 if not data
: return ( -1, -1, -1 )
108 ui16
= <uint16_t
*>data
111 # we filter out unmapped sequence or bad sequence or secondary or supplementary alignment
112 # we filter out 2nd mate, not a proper pair, mate is unmapped
113 if (bwflag
& 2820) or (bwflag
& 1 and (bwflag
& 136 or not bwflag
& 2)): return ( -1, -1, -1 )
115 i32
= <int32_t
*>data
119 # In case of paired-end we have now skipped all possible "bad" pairs
120 # in case of proper pair we have skipped the rightmost one... if the leftmost pair comes
121 # we can treat it as a single read, so just check the strand and calculate its
122 # start position... hope I'm right!
124 # read mapped to minus strand; then we have to compute cigar to find the rightmost position
125 ui8
= <uint8_t
*>data
127 # need to decipher CIGAR string
128 ui32
= <uint32_t
*>(data
+ 32 + l_read_name
) # move pointer to cigar_code
129 for cigar_code
in ui32
[:n_cigar_op
]:#unpack( '<%dI' % (n_cigar_op) , data[ 32 + l_read_name : 32 + l_read_name + n_cigar_op*4 ] ):
130 if cigar_code
& 15 in [ 0, 2, 3, 7, 8 ]: # they are CIGAR op M/D/N/=/X
131 thisstart
+= cigar_code
>> 4
136 return ( thisref
, thisstart
, thisstrand
)
138 cdef tuple __bam_fw_binary_parse_be
( const unsigned
char * data
):
139 """Big endian version. We need byte swap.
142 int32_t thisref
, thisstart
, thisstrand
147 uint8_t
*ui8
# we will only cast 1 byte at a time
149 uint32_t shift0
, shift
152 # we skip lot of the available information in data (i.e. tag name, quality etc etc)
153 # no data, return, does it really happen without raising struct.error?
154 if not data
: return ( -1, -1, -1 )
156 ui8
= <uint8_t
*>data
157 bwflag
= ui8
[15] << 8 | ui8
[14] # it works as bwflag = ui16[7] in little-endian
159 # we filter out unmapped sequence or bad sequence or secondary or supplementary alignment
160 # we filter out 2nd mate, not a proper pair, mate is unmapped
161 if (bwflag
& 2820) or (bwflag
& 1 and (bwflag
& 136 or not bwflag
& 2)): return ( -1, -1, -1 )
163 # the following three lins are for little-endian
166 #n_cigar_op = ui16[6]
168 # to simplify the byte swap, we pretend all original numbers (thisref, pos, nextpos) positive
169 thisref
= ui8
[3] << 24 | ui8
[2] << 16 | ui8
[1] << 8 | ui8
[0]
170 thisstart
= ui8
[7] << 24 | ui8
[6] << 16 | ui8
[5] << 8 | ui8
[4]
171 n_cigar_op
= ui8
[13] << 8 | i8
[12]
173 # In case of paired-end we have now skipped all possible "bad" pairs
174 # in case of proper pair we have skipped the rightmost one... if the leftmost pair comes
175 # we can treat it as a single read, so just check the strand and calculate its
176 # start position... hope I'm right!
178 # read mapped to minus strand; then we have to compute cigar to find the rightmost position
180 # need to decipher CIGAR string
181 # move pointer to cigar_code
182 shift0
= 32 + l_read_name
183 for i
in range(n_cigar_op
):
184 shift
= shift0
+ i
*4 # move 32bit at a time
185 cigar_code
= ui8
[shift0
+3] << 24 | ui8
[shift0
+2] << 16 | ui8
[shift0
+1] << 8 | ui8
[shift0
] # it works like cigar_code = ui32[...] in little-endian
186 if cigar_code
& 15 in [ 0, 2, 3, 7, 8 ]: # they are CIGAR op M/D/N/=/X
187 thisstart
+= cigar_code
>> 4
192 return ( thisref
, thisstart
, thisstrand
)
194 cdef tuple __bampe_pe_binary_parse_le
(const unsigned
char * data
):
195 """Parse a BAMPE record in little-endian system.
198 int32_t thisref
, thisstart
, thistlen
205 # we skip lot of the available information in data (i.e. tag name, quality etc etc)
207 return ( -1, -1, -1 )
209 ui16
= <uint16_t
*>data
211 # we filter out unmapped, bad sequence, secondary/supplementary alignment
212 # we filter out other mate of paired reads, not a proper pair, or mate is unmapped
213 if (bwflag
& 2820) or (bwflag
& 1 and (bwflag
& 136 or not bwflag
& 2)):
214 return ( -1, -1, -1 )
216 i32
= <int32_t
*>data
217 ui8
= <uint8_t
*>data
223 thisstart
= min(pos
, nextpos
) # we keep only the leftmost
224 # position which means this must
225 # be at + strand. So we don't
226 # need to decipher CIGAR string.
227 thistlen
= abs( thistlen
) # Actually, if
235 return ( thisref
, thisstart
, thistlen
)
237 cdef tuple __bampe_pe_binary_parse_be
(const unsigned
char * data
):
238 """Parse a BAMPE record in big-endian system. And we need byte swap.
241 int32_t thisref
, thisstart
, thistlen
242 uint32_t tmp_thistlen
245 uint8_t
*ui8
# we will only cast 1 byte at a time
247 # we skip lot of the available information in data (i.e. tag name, quality etc etc)
249 return ( -1, -1, -1 )
251 ui8
= <uint8_t
*>data
253 bwflag
= ui8
[15] << 8 | ui8
[14] # as le: bwflag = ui16[7]
254 # we filter out unmapped, bad sequence, secondary/supplementary alignment
255 # we filter out other mate of paired reads, not a proper pair, or mate is unmapped
256 if (bwflag
& 2820) or (bwflag
& 1 and (bwflag
& 136 or not bwflag
& 2)):
257 return ( -1, -1, -1 )
261 # the following three lins are for little-endian
265 # to simplify the byte swap, we pretend all original numbers (thisref, pos, nextpos) positive
266 thisref
= ui8
[3] << 24 | ui8
[2] << 16 | ui8
[1] << 8 | ui8
[0] # as le:thisref = i32[0]
267 pos
= ui8
[7] << 24 | ui8
[6] << 16 | ui8
[5] << 8 | ui8
[4] # as le:pos = i32[1]
268 nextpos
= ui8
[27] << 24 | ui8
[26] << 16 | ui8
[25] << 8 | ui8
[24] # as le:nextpos = i32[6]
270 # thistlen can be negative, so we byte swap it then convert to int32_t then take abs (maybe there is more effecient way?)
271 tmp_thistlen
= ui8
[31] << 24 | ui8
[30] << 16 | ui8
[29] << 8 | ui8
[28] # as le:tmp_thistlen = ui32[7]
272 thistlen
= abs(<int32_t
> tmp_thistlen
)
274 # position which means this must
275 # be at + strand. So we don't
276 # need to decipher CIGAR string.
277 thisstart
= pos
if nextpos
> pos
else nextpos
#min(pos, nextpos) # we keep only the leftmost
279 return ( thisref
, thisstart
, thistlen
)
282 # choose a parser according to endian
284 bam_se_entry_parser
= __bam_fw_binary_parse_le
285 bampe_pe_entry_parser
= __bampe_pe_binary_parse_le
287 bam_se_entry_parser
= __bam_fw_binary_parse_be
288 bampe_pe_entry_parser
= __bampe_pe_binary_parse_be
290 # ------------------------------------
292 # ------------------------------------
293 class StrandFormatError
( Exception ):
294 """Exception about strand format error.
297 raise StrandFormatError('Must be F or R','X')
299 def __init__
( self, string
, strand
):
303 def __str__
( self ):
304 return repr( "Strand information can not be recognized in this line: \"%s\",\"%s\"" % ( self.string
, self.strand
) )
306 cdef class GenericParser
:
307 """Generic Parser class.
309 Inherit this class to write your own parser. In most cases, you need to override:
311 1. __tlen_parse_line which returns tag length of a line
312 2. __fw_parse_line which returns tuple of ( chromosome, 5'position, strand )
321 def __init__
( self, str filename
, int64_t buffer_size
= 100000 ):
322 """Open input file. Determine whether it's a gzipped file.
324 'filename' must be a string object.
326 This function initialize the following attributes:
328 1. self.filename: the filename for input file.
329 2. self.gzipped: a boolean indicating whether input file is gzipped.
330 3. self.fhd: buffered I/O stream of input file
332 self.filename
= filename
335 self.buffer_size
= buffer_size
337 f
= gzip
.open( filename
)
345 # open with gzip.open, then wrap it with BufferedReader!
346 self.fhd
= io
.BufferedReader
( gzip
.open( filename
, mode
='rb' ), buffer_size
= READ_BUFFER_SIZE
) # buffersize set to 10M
348 self.fhd
= io
.open( filename
, mode
='rb' ) # binary mode! I don't expect unicode here!
349 self.__skip_first_commentlines
()
351 cdef void __skip_first_commentlines
( self ):
352 """Some parser needs to skip the first several comment lines.
354 Redefine this if it's necessary!
358 cpdef int32_t tsize
( self ):
359 """General function to detect tag size.
361 * Although it can be used by most parsers, it must be
362 rewritten by BAMParser!
366 int32_t n
= 0 # number of successful/valid read alignments
367 int32_t m
= 0 # number of trials
368 int32_t this_taglength
371 if self.tag_size
!= -1:
372 # if we have already calculated tag size (!= -1), return it.
375 # try 10k times or retrieve 10 successfule alignments
376 while n
< 10 and m
< 10000:
378 thisline
= self.fhd
.readline
()
379 this_taglength
= self.__tlen_parse_line
( thisline
)
380 if this_taglength
> 0:
381 # this_taglength == 0 means this line doesn't contain
382 # successful alignment.
387 self.__skip_first_commentlines
()
388 if n
!= 0: # else tsize = -1
389 self.tag_size
= <int32_t
>(s
/n
)
392 cdef int32_t __tlen_parse_line
( self, bytes thisline
):
393 """Abstract function to detect tag length.
398 cpdef build_fwtrack
( self ):
399 """Generic function to build FWTrack object. Create a new
400 FWTrack object. If you want to append new records to an
401 existing FWTrack object, try append_fwtrack function.
403 * BAMParser for binary BAM format should have a different one.
406 int64_t i
, fpos
, strand
410 fwtrack
= FWTrack
( buffer_size
= self.buffer_size
)
413 # for each block of input
414 tmp
+= self.fhd
.read
( READ_BUFFER_SIZE
)
417 lines
= tmp
.split
(b
"\n")
419 for thisline
in lines
[ :-1 ]:
420 ( chromosome
, fpos
, strand
) = self.__fw_parse_line
( thisline
)
421 if fpos
< 0 or not chromosome
:
422 # normally __fw_parse_line will return -1 if the line
423 # contains no successful alignment.
427 info
( " %d reads parsed" % i
)
428 fwtrack
.add_loc
( chromosome
, fpos
, strand
)
431 ( chromosome
, fpos
, strand
) = self.__fw_parse_line
( tmp
)
432 if fpos
>= 0 and chromosome
:
434 fwtrack
.add_loc
( chromosome
, fpos
, strand
)
439 cpdef append_fwtrack
( self, fwtrack
):
440 """Add more records to an existing FWTrack object.
444 int64_t i
, fpos
, strand
449 # for each block of input
450 tmp
+= self.fhd
.read
( READ_BUFFER_SIZE
)
453 lines
= tmp
.split
(b
"\n")
455 for thisline
in lines
[ :-1 ]:
456 ( chromosome
, fpos
, strand
) = self.__fw_parse_line
( thisline
)
457 if fpos
< 0 or not chromosome
:
458 # normally __fw_parse_line will return -1 if the line
459 # contains no successful alignment.
463 info
( " %d reads parsed" % i
)
464 fwtrack
.add_loc
( chromosome
, fpos
, strand
)
468 ( chromosome
, fpos
, strand
) = self.__fw_parse_line
( tmp
)
469 if fpos
>= 0 and chromosome
:
471 fwtrack
.add_loc
( chromosome
, fpos
, strand
)
476 cdef tuple __fw_parse_line
( self, bytes thisline
):
477 """Abstract function to parse chromosome, 5' end position and
481 cdef bytes chromosome
= b
""
482 cdef int32_t fpos
= -1
483 cdef int32_t strand
= -1
484 return ( chromosome
, fpos
, strand
)
486 cpdef sniff
( self ):
487 """Detect whether this parser is the correct parser for input
490 Rule: try to find the tag size using this parser, if error
491 occurs or tag size is too small or too big, check is failed.
493 * BAMParser has a different sniff function.
498 if t
<= 10 or t
>= 10000: # tsize too small or too big
503 self.__skip_first_commentlines
()
506 cpdef close
( self ):
507 """Run this when this Parser will be never used.
509 Close file I/O stream.
513 cpdef bool is_gzipped
( self ):
516 cdef class BEDParser
( GenericParser
):
517 """File Parser Class for BED File.
520 cdef void __skip_first_commentlines
( self ):
521 """BEDParser needs to skip the first several comment lines.
526 for thisline
in self.fhd
:
527 l_line
= len( thisline
)
528 if thisline
and ( thisline
[ :5 ] != b
"track" ) \
529 and ( thisline
[ :7 ] != b
"browser" ) \
530 and ( thisline
[ 0 ] != 35 ): # 35 is b"#"
533 # rewind from SEEK_CUR
534 self.fhd
.seek
( -l_line
, 1 )
537 cdef int32_t __tlen_parse_line
( self, bytes thisline
):
538 """Parse 5' and 3' position, then calculate frag length.
541 thisline
= thisline
.rstrip
()
545 thisfields
= thisline
.split
( b
'\t' )
546 return atoi
( thisfields
[ 2 ] )-atoi
( thisfields
[ 1 ] )
548 cdef tuple __fw_parse_line
( self, bytes thisline
):
549 #cdef list thisfields
554 thisline
= thisline
.rstrip
()
555 thisfields
= thisline
.split
( b
'\t' )
556 chromname
= thisfields
[ 0 ]
558 #if not strcmp(thisfields[ 5 ],b"+"):
559 if thisfields
[5] == b
"+":
561 atoi
( thisfields
[ 1 ] ),
563 #elif not strcmp(thisfields[ 5 ], b"-"):
564 elif thisfields
[5] == b
"-":
566 atoi
( thisfields
[ 2 ] ),
569 raise StrandFormatError
( thisline
, thisfields
[ 5 ] )
571 # default pos strand if no strand
574 atoi
( thisfields
[ 1 ] ),
577 cdef class BEDPEParser
(GenericParser
):
578 """Parser for BED format file containing PE information, and also
579 can be used for the cases when users predefine the fragment
580 locations by shifting/extending by themselves.
584 chromosome_name frag_leftend frag_rightend
587 Note: Only the first columns are used!
590 cdef public int32_t n
591 cdef public float32_t d
593 cdef void __skip_first_commentlines
( self ):
594 """BEDPEParser needs to skip the first several comment lines.
599 for thisline
in self.fhd
:
600 l_line
= len( thisline
)
601 if thisline
and ( thisline
[ :5 ] != b
"track" ) \
602 and ( thisline
[ :7 ] != b
"browser" ) \
603 and ( thisline
[ 0 ] != 35 ): # 35 is b"#"
606 # rewind from SEEK_CUR
607 self.fhd
.seek
( -l_line
, 1 )
610 cdef __pe_parse_line
( self, bytes thisline
):
611 """ Parse each line, and return chromosome, left and right positions
617 thisline
= thisline
.rstrip
()
619 # still only support tabular as delimiter.
620 thisfields
= thisline
.split
( b
'\t' )
622 return ( thisfields
[ 0 ],
623 atoi
( thisfields
[ 1 ] ),
624 atoi
( thisfields
[ 2 ] ) )
626 raise Exception("Less than 3 columns found at this line: %s\n" % thisline
)
628 cpdef build_petrack
( self ):
629 """Build PETrackI from all lines.
636 int64_t i
= 0 # number of fragments
637 int64_t m
= 0 # sum of fragment lengths
640 petrack
= PETrackI
( buffer_size
= self.buffer_size
)
641 add_loc
= petrack
.add_loc
644 # for each block of input
645 tmp
+= self.fhd
.read
( READ_BUFFER_SIZE
)
648 lines
= tmp
.split
(b
"\n")
650 for thisline
in lines
[ :-1 ]:
651 ( chromosome
, left_pos
, right_pos
) = self.__pe_parse_line
( thisline
)
652 if left_pos
< 0 or not chromosome
:
654 assert right_pos
> left_pos
, "Right position must be larger than left position, check your BED file at line: %s" % thisline
655 m
+= right_pos
- left_pos
658 info
( " %d fragments parsed" % i
)
659 add_loc
( chromosome
, left_pos
, right_pos
)
662 ( chromosome
, left_pos
, right_pos
) = self.__pe_parse_line
( thisline
)
663 if left_pos
>= 0 and chromosome
:
664 assert right_pos
> left_pos
, "Right position must be larger than left position, check your BED file at line: %s" % thisline
666 m
+= right_pos
- left_pos
667 add_loc
( chromosome
, left_pos
, right_pos
)
669 self.d
= <float32_t
>( m
) / i
671 assert self.d
>= 0, "Something went wrong (mean fragment size was negative)"
674 petrack
.set_rlengths
( {"DUMMYCHROM":0} )
677 cpdef append_petrack
(self, petrack
):
678 """Build PETrackI from all lines, return a PETrackI object.
684 int64_t i
= 0 # number of fragments
685 int64_t m
= 0 # sum of fragment lengths
688 add_loc
= petrack
.add_loc
690 # for each block of input
691 tmp
+= self.fhd
.read
( READ_BUFFER_SIZE
)
694 lines
= tmp
.split
(b
"\n")
696 for thisline
in lines
[ :-1 ]:
697 ( chromosome
, left_pos
, right_pos
) = self.__pe_parse_line
( thisline
)
698 if left_pos
< 0 or not chromosome
:
700 assert right_pos
> left_pos
, "Right position must be larger than left position, check your BED file at line: %s" % thisline
701 m
+= right_pos
- left_pos
704 info
( " %d fragments parsed" % i
)
705 add_loc
( chromosome
, left_pos
, right_pos
)
708 ( chromosome
, left_pos
, right_pos
) = self.__pe_parse_line
( thisline
)
709 if left_pos
>= 0 and chromosome
:
710 assert right_pos
> left_pos
, "Right position must be larger than left position, check your BED file at line: %s" % thisline
712 m
+= right_pos
- left_pos
713 add_loc
( chromosome
, left_pos
, right_pos
)
715 self.d
= ( self.d
* self.n
+ m
) / ( self.n
+ i
)
718 assert self.d
>= 0, "Something went wrong (mean fragment size was negative)"
720 petrack
.set_rlengths
( {"DUMMYCHROM":0} )
723 cdef class ELANDResultParser
( GenericParser
):
724 """File Parser Class for tabular File.
727 cdef void __skip_first_commentlines
( self ):
728 """ELANDResultParser needs to skip the first several comment lines.
733 for thisline
in self.fhd
:
734 l_line
= len( thisline
)
735 if thisline
and thisline
[ 0 ] != 35: # 35 is b"#"
738 # rewind from SEEK_CUR
739 self.fhd
.seek
( -l_line
, 1 )
742 cdef int32_t __tlen_parse_line
( self, bytes thisline
):
743 """Parse tag sequence, then tag length.
746 thisline
= thisline
.rstrip
()
747 if not thisline
: return 0
748 thisfields
= thisline
.split
( b
'\t' )
749 if thisfields
[1].isdigit
():
752 return len( thisfields
[ 1 ] )
754 cdef tuple __fw_parse_line
( self, bytes thisline
):
756 bytes chromname
, strand
757 int32_t thistaglength
758 thisline
= thisline
.rstrip
()
759 if not thisline
: return ( b
"", -1, -1 )
761 thisfields
= thisline
.split
( b
'\t' )
762 thistaglength
= strlen
( thisfields
[ 1 ] )
764 if len( thisfields
) <= 6:
765 return ( b
"", -1, -1 )
768 chromname
= thisfields
[ 6 ]
769 chromname
= chromname
[ :chromname
.rindex
( b
".fa" ) ]
773 if thisfields
[ 2 ] == b
"U0" or thisfields
[ 2 ] == b
"U1" or thisfields
[ 2 ] == b
"U2":
774 # allow up to 2 mismatches...
775 strand
= thisfields
[ 8 ]
778 atoi
( thisfields
[ 7 ] ) - 1,
782 atoi
( thisfields
[ 7 ] ) + thistaglength
- 1,
785 raise StrandFormatError
( thisline
, strand
)
787 return ( b
"", -1, -1 )
789 cdef class ELANDMultiParser
( GenericParser
):
790 """File Parser Class for ELAND multi File.
792 Note this parser can only work for s_N_eland_multi.txt format.
794 Each line of the output file contains the following fields:
797 3. Either NM, QC, RM (as described above) or the following:
798 4. x:y:z where x, y, and z are the number of exact, single-error, and 2-error matches
800 5. Blank, if no matches found or if too many matches found, or the following:
801 BAC_plus_vector.fa:163022R1,170128F2,E_coli.fa:3909847R1
802 This says there are two matches to BAC_plus_vector.fa: one in the reverse direction
803 starting at position 160322 with one error, one in the forward direction starting at
804 position 170128 with two errors. There is also a single-error match to E_coli.fa.
806 cdef void __skip_first_commentlines
( self ):
807 """ELANDResultParser needs to skip the first several comment lines.
812 for thisline
in self.fhd
:
813 l_line
= len( thisline
)
814 if thisline
and thisline
[ 0 ] != 35: # 35 is b"#"
817 # rewind from SEEK_CUR
818 self.fhd
.seek
( -l_line
, 1 )
821 cdef int32_t __tlen_parse_line
( self, bytes thisline
):
822 """Parse tag sequence, then tag length.
825 thisline
= thisline
.rstrip
()
826 if not thisline
: return 0
827 thisfields
= thisline
.split
( b
'\t' )
828 if thisfields
[1].isdigit
():
831 return len( thisfields
[ 1 ] )
833 cdef tuple __fw_parse_line
( self, bytes thisline
):
836 bytes thistagname
, pos
, strand
837 int32_t thistaglength
, thistaghits
839 if not thisline
: return ( b
"", -1, -1 )
840 thisline
= thisline
.rstrip
()
841 if not thisline
: return ( b
"", -1, -1 )
843 thisfields
= thisline
.split
( b
'\t' )
844 thistagname
= thisfields
[ 0 ] # name of tag
845 thistaglength
= len( thisfields
[ 1 ] ) # length of tag
847 if len( thisfields
) < 4:
848 return ( b
"", -1, -1 )
850 thistaghits
= sum
( [<int32_t
>(x
) for x
in thisfields
[ 2 ].split
( b
':' ) ] )
853 return ( b
"", -1, -1 )
855 ( chromname
, pos
) = thisfields
[ 3 ].split
( b
':' )
858 chromname
= chromname
[ :chromname
.rindex
( b
".fa" ) ]
865 <int32_t
>( pos
[ :-2 ] )-1,
869 <int32_t
>( pos
[ :-2 ] ) + thistaglength
- 1,
872 raise StrandFormatError
( thisline
,strand
)
875 cdef class ELANDExportParser
( GenericParser
):
876 """File Parser Class for ELAND Export File.
879 cdef void __skip_first_commentlines
( self ):
880 """ELANDResultParser needs to skip the first several comment lines.
885 for thisline
in self.fhd
:
886 l_line
= len( thisline
)
887 if thisline
and thisline
[ 0 ] != 35: # 35 is b"#"
890 # rewind from SEEK_CUR
891 self.fhd
.seek
( -l_line
, 1 )
894 cdef int32_t __tlen_parse_line
( self, bytes thisline
):
895 """Parse tag sequence, then tag length.
898 thisline
= thisline
.rstrip
()
899 if not thisline
: return 0
900 thisfields
= thisline
.split
( b
'\t' )
901 if len( thisfields
) > 12 and thisfields
[ 12 ]:
902 # a successful alignment has over 12 columns
903 return len( thisfields
[ 8 ] )
907 cdef tuple __fw_parse_line
( self, bytes thisline
):
910 bytes thisname
, strand
911 int32_t thistaglength
913 thisline
= thisline
.rstrip
()
914 if not thisline
: return ( b
"", -1, -1 )
916 thisfields
= thisline
.split
( b
"\t" )
918 if len(thisfields
) > 12 and thisfields
[ 12 ]:
919 thisname
= b
":".join
( thisfields
[ 0:6 ] )
920 thistaglength
= len( thisfields
[ 8 ] )
921 strand
= thisfields
[ 13 ]
923 return ( thisfields
[ 10 ], atoi
( thisfields
[ 12 ] ) - 1, 0 )
925 return ( thisfields
[ 10 ], atoi
( thisfields
[ 12 ] ) + thistaglength
- 1, 1 )
927 raise StrandFormatError
( thisline
, strand
)
929 return ( b
"", -1, -1 )
931 ### Contributed by Davide, modified by Tao
932 cdef class SAMParser
( GenericParser
):
933 """File Parser Class for SAM File.
935 Each line of the output file contains at least:
939 4. 1-based leftmost position fo clipped alignment
942 7. Mate Reference Name
943 8. 1-based leftmost Mate Position
944 9. Inferred insert size
945 10. Query sequence on the same strand as the reference
948 The bitwise flag is made like this:
955 16 strand of the query (1 -> reverse)
956 32 strand of the mate
957 64 first read in pair
958 128 second read in pair
959 256 alignment is not primary
960 512 does not pass quality check
961 1024 PCR or optical duplicate
962 2048 supplementary alignment
965 cdef void __skip_first_commentlines
( self ):
966 """SAMParser needs to skip the first several comment lines.
971 for thisline
in self.fhd
:
972 l_line
= len( thisline
)
973 if thisline
and thisline
[ 0 ] != 64: # 64 is b"@"
976 # rewind from SEEK_CUR
977 self.fhd
.seek
( -l_line
, 1 )
980 cdef int32_t __tlen_parse_line
( self, bytes thisline
):
981 """Parse tag sequence, then tag length.
988 thisline
= thisline
.rstrip
()
989 if not thisline
: return 0
990 thisfields
= thisline
.split
( b
'\t' )
991 bwflag
= atoi
( thisfields
[ 1 ] )
992 if bwflag
& 4 or bwflag
& 512 or bwflag
& 256 or bwflag
& 2048:
993 return 0 #unmapped sequence or bad sequence or 2nd or sup alignment
995 # paired read. We should only keep sequence if the mate is mapped
996 # and if this is the left mate, all is within the flag!
998 return 0 # not a proper pair
1000 return 0 # the mate is unmapped
1001 # From Benjamin Schiller https://github.com/benjschiller
1003 # this is not the first read in a pair
1005 return len( thisfields
[ 9 ] )
1007 cdef tuple __fw_parse_line
( self, bytes thisline
):
1010 bytes thistagname
, thisref
1011 int32_t bwflag
, thisstrand
, thisstart
1013 thisline
= thisline
.rstrip
()
1014 if not thisline
: return ( b
"", -1, -1 )
1015 thisfields
= thisline
.split
( b
'\t' )
1016 thistagname
= thisfields
[ 0 ] # name of tag
1017 thisref
= thisfields
[ 2 ]
1018 bwflag
= atoi
( thisfields
[ 1 ] )
1019 CIGAR
= thisfields
[ 5 ]
1021 if (bwflag
& 2820) or (bwflag
& 1 and (bwflag
& 136 or not bwflag
& 2)): return ( b
"", -1, -1 )
1023 #if bwflag & 4 or bwflag & 512 or bwflag & 256 or bwflag & 2048:
1024 # return ( b"", -1, -1 ) #unmapped sequence or bad sequence or 2nd or sup alignment
1026 # # paired read. We should only keep sequence if the mate is mapped
1027 # # and if this is the left mate, all is within the flag!
1028 # if not bwflag & 2:
1029 # return ( b"", -1, -1 ) # not a proper pair
1031 # return ( b"", -1, -1 ) # the mate is unmapped
1032 # # From Benjamin Schiller https://github.com/benjschiller
1034 # # this is not the first read in a pair
1035 # return ( b"", -1, -1 )
1036 # # end of the patch
1037 # In case of paired-end we have now skipped all possible "bad" pairs
1038 # in case of proper pair we have skipped the rightmost one... if the leftmost pair comes
1039 # we can treat it as a single read, so just check the strand and calculate its
1040 # start position... hope I'm right!
1042 # minus strand, we have to decipher CIGAR string
1044 thisstart
= atoi
( thisfields
[ 3 ] ) - 1 + sum
( [ <int32_t
>(x
) for x
in findall
(b
"(\d+)[MDNX=]",CIGAR
) ] ) #reverse strand should be shifted alen bp
1047 thisstart
= atoi
( thisfields
[ 3 ] ) - 1
1050 thisref
= thisref
[ :thisref
.rindex
( b
".fa" ) ]
1054 return ( thisref
, thisstart
, thisstrand
)
1056 cdef class BAMParser
( GenericParser
):
1057 """File Parser Class for BAM File.
1059 File is gzip-compatible and binary.
1060 Information available is the same that is in SAM format.
1062 The bitwise flag is made like this:
1069 16 strand of the query (1 -> reverse)
1070 32 strand of the mate
1071 64 first read in pair
1072 128 second read in pair
1073 256 alignment is not primary
1074 512 does not pass quality check
1075 1024 PCR or optical duplicate
1076 2048 supplementary alignment
1078 def __init__
( self, str filename
, int64_t buffer_size
= 100000 ):
1079 """Open input file. Determine whether it's a gzipped file.
1081 'filename' must be a string object.
1083 This function initialize the following attributes:
1085 1. self.filename: the filename for input file.
1086 2. self.gzipped: a boolean indicating whether input file is gzipped.
1087 3. self.fhd: buffered I/O stream of input file
1089 self.filename
= filename
1092 self.buffer_size
= buffer_size
1094 f
= gzip
.open( filename
)
1098 # not a gzipped file
1099 self.gzipped
= False
1102 # open with gzip.open, then wrap it with BufferedReader!
1103 self.fhd
= io
.BufferedReader
( gzip
.open( filename
, mode
='rb' ), buffer_size
= READ_BUFFER_SIZE
) # buffersize set to 1M
1105 self.fhd
= io
.open( filename
, mode
='rb' ) # binary mode! I don't expect unicode here!
1107 cpdef sniff
( self ):
1108 """Check the first 3 bytes of BAM file. If it's 'BAM', check
1112 magic_header
= self.fhd
.read
( 3 )
1113 if magic_header
== b
"BAM":
1114 tsize
= self.tsize
()
1120 raise Exception( "File is not of a valid BAM format! %d" % tsize
)
1125 cpdef int32_t tsize
( self ):
1126 """Get tag size from BAM file -- read l_seq field.
1128 Refer to: http://samtools.sourceforge.net/SAM1.pdf
1130 * This may not work for BAM file from bedToBAM (bedtools),
1131 since the l_seq field seems to be 0.
1135 int32_t x
, header_len
, nc
, nlength
1136 int32_t n
= 0 # successful read of tag size
1137 float64_t s
= 0 # sum of tag sizes
1139 if self.tag_size
!= -1:
1140 # if we have already calculated tag size (!= -1), return it.
1141 return self.tag_size
1143 fseek
= self.fhd
.seek
1144 fread
= self.fhd
.read
1145 ftell
= self.fhd
.tell
1146 # move to pos 4, there starts something
1148 header_len
= unpack
( '<i', fread
( 4 ) )[ 0 ]
1149 fseek
( header_len
+ ftell
() )
1150 # get the number of chromosome
1151 nc
= unpack
( '<i', fread
( 4 ) )[ 0 ]
1152 for x
in range( nc
):
1153 # read each chromosome name
1154 nlength
= unpack
( '<i' , fread
( 4 ) )[ 0 ]
1155 # jump over chromosome size, we don't need it
1157 fseek
( ftell
() + 4 )
1159 entrylength
= unpack
( '<i', fread
( 4 ) )[ 0 ]
1160 data
= fread
( entrylength
)
1161 a
= unpack
( '<i', data
[16:20] )[ 0 ]
1165 self.tag_size
= <int32_t
>(s
/n
)
1166 return self.tag_size
1168 cpdef
tuple get_references
( self ):
1170 read in references from BAM header
1172 return a tuple (references (list of names),
1173 rlengths (dict of lengths)
1176 int32_t header_len
, x
, nc
, nlength
1178 list references
= []
1181 fseek
= self.fhd
.seek
1182 fread
= self.fhd
.read
1183 ftell
= self.fhd
.tell
1184 # move to pos 4, there starts something
1186 header_len
= unpack
( '<i', fread
( 4 ) )[ 0 ]
1187 fseek
( header_len
+ ftell
() )
1188 # get the number of chromosome
1189 nc
= unpack
( '<i', fread
( 4 ) )[ 0 ]
1190 for x
in range( nc
):
1191 # read each chromosome name
1192 nlength
= unpack
( '<i', fread
( 4 ) )[ 0 ]
1193 refname
= fread
( nlength
)[ :-1 ]
1194 references
.append
( refname
)
1195 # don't jump over chromosome size
1196 # we can use it to avoid falling of chrom ends during peak calling
1197 rlengths
[refname
] = unpack
( '<i', fread
( 4 ) )[ 0 ]
1198 return (references
, rlengths
)
1200 cpdef build_fwtrack
( self ):
1201 """Build FWTrack from all lines, return a FWTrack object.
1203 Note only the unique match for a tag is kept.
1206 int64_t i
= 0 #number of reads kept
1207 int32_t entrylength
, fpos
, strand
, chrid
1211 fwtrack
= FWTrack
( buffer_size
= self.buffer_size
)
1212 references
, rlengths
= self.get_references
() # after this, ptr at list of alignments
1213 fseek
= self.fhd
.seek
1214 fread
= self.fhd
.read
1215 ftell
= self.fhd
.tell
1219 entrylength
= unpack
( "<i", fread
( 4 ) )[0]
1220 except struct.error
:
1222 ( chrid
, fpos
, strand
) = bam_se_entry_parser
( fread
( entrylength
) )
1223 if chrid
== -1: continue
1224 fwtrack
.add_loc
( references
[ chrid
], fpos
, strand
)
1226 if i
% 1000000 == 0:
1227 info
( " %d reads parsed" % i
)
1229 #print( f"{references[chrid]:},{fpos:},{strand:}" )
1230 info
( "%d reads have been read." % i
)
1232 fwtrack
.set_rlengths
( rlengths
)
1235 cpdef append_fwtrack
( self, fwtrack
):
1236 """Build FWTrack from all lines, return a FWTrack object.
1238 Note only the unique match for a tag is kept.
1241 int64_t i
= 0 #number of reads kept
1242 int32_t entrylength
, fpos
, strand
, chrid
1246 references
, rlengths
= self.get_references
()
1247 fseek
= self.fhd
.seek
1248 fread
= self.fhd
.read
1249 ftell
= self.fhd
.tell
1253 entrylength
= unpack
( '<i', fread
( 4 ) )[ 0 ]
1254 except struct.error
:
1256 ( chrid
, fpos
, strand
) = bam_se_entry_parser
( fread
( entrylength
) )
1257 if chrid
== -1: continue
1258 fwtrack
.add_loc
( references
[ chrid
], fpos
, strand
)
1260 if i
% 1000000 == 0:
1261 info
( " %d reads parsed" % i
)
1263 info
( "%d reads have been read." % i
)
1266 # this is the problematic part. If fwtrack is finalized, then it's impossible to increase the length of it in a step of buffer_size for multiple input files.
1267 fwtrack
.set_rlengths
( rlengths
)
1270 cdef class BAMPEParser
(BAMParser
):
1271 """File Parser Class for BAM File containing paired-end reads
1272 Only counts valid pairs, discards everything else
1273 Uses the midpoint of every read and calculates the average fragment size
1274 on the fly instead of modeling it
1276 File is gzip-compatible and binary.
1277 Information available is the same that is in SAM format.
1279 The bitwise flag is made like this:
1286 16 strand of the query (1 -> reverse)
1287 32 strand of the mate
1288 64 first read in pair
1289 128 second read in pair
1290 256 alignment is not primary
1291 512 does not pass quality check
1292 1024 PCR or optical duplicate
1293 2048 supplementary alignment
1295 cdef public int32_t n
# total number of fragments
1296 cdef public float32_t d
# the average length of fragments
1298 cpdef
object build_petrack
( self ):
1299 """Build PETrackI from all lines, return a FWTrack object.
1302 int64_t i
# number of fragments kept
1303 int64_t m
# sum of fragment lengths
1304 int32_t entrylength
, fpos
, chrid
, tlen
1311 petrack
= PETrackI
( buffer_size
= self.buffer_size
)
1312 references
, rlengths
= self.get_references
()
1313 fseek
= self.fhd
.seek
1314 fread
= self.fhd
.read
1315 ftell
= self.fhd
.tell
1317 # for convenience, only count valid pairs
1318 add_loc
= petrack
.add_loc
1323 entrylength
= unpack
( '<i', fread
(4) )[0]
1327 ( chrid
, fpos
, tlen
) = bampe_pe_entry_parser
( fread
(entrylength
) )
1331 add_loc
(references
[ chrid
], fpos
, fpos
+ tlen
)
1334 if i
% 1000000 == 0:
1335 info
( " %d fragments parsed" % i
)
1337 #print( f"{references[chrid]:},{fpos:},{tlen:}" )
1338 info
( "%d fragments have been read." % i
)
1339 #debug( f" {e1} Can't identify the length of entry, it may be the end of file, stop looping..." )
1340 #debug( f" {e2} Chromosome name can't be found which means this entry is skipped ..." )
1341 #assert i > 0, "Something went wrong, no fragment has been read! Check input file!"
1344 #assert self.d >= 0, "Something went wrong (mean fragment size was negative: %d = %d / %d)" % (self.d, m, i)
1346 petrack
.set_rlengths
( rlengths
)
1349 cpdef append_petrack
(self, petrack
):
1350 """Build PETrackI from all lines, return a PETrackI object.
1353 int64_t i
= 0 # number of fragments kept
1354 int64_t m
= 0 # sum of fragment lengths
1355 int32_t entrylength
, fpos
, chrid
, tlen
1359 references
, rlengths
= self.get_references
()
1360 fseek
= self.fhd
.seek
1361 fread
= self.fhd
.read
1362 ftell
= self.fhd
.tell
1364 # for convenience, only count valid pairs
1365 add_loc
= petrack
.add_loc
1370 entrylength
= unpack
('<i', fread
(4))[0]
1373 ( chrid
, fpos
, tlen
) = bampe_pe_entry_parser
( fread
(entrylength
) )
1376 add_loc
(references
[ chrid
], fpos
, fpos
+ tlen
)
1379 if i
% 1000000 == 0:
1380 info
(" %d fragments parsed" % i
)
1382 info
( "%d fragments have been read." % i
)
1383 self.d
= ( self.d
* self.n
+ m
) / ( self.n
+ i
)
1385 #assert self.d >= 0, "Something went wrong (mean fragment size was negative: %d = %d / %d)" % (self.d, m, i)
1387 # this is the problematic part. If fwtrack is finalized, then it's impossible to increase the length of it in a step of buffer_size for multiple input files.
1388 # petrack.finalize()
1389 petrack
.set_rlengths
( rlengths
)
1392 cdef class BowtieParser
( GenericParser
):
1393 """File Parser Class for map files from Bowtie or MAQ's maqview
1397 cdef int32_t __tlen_parse_line
( self, bytes thisline
):
1398 """Parse tag sequence, then tag length.
1401 cdef list thisfields
1403 thisline
= thisline
.rstrip
()
1404 if not thisline
: return ( b
"", -1, -1 )
1405 if thisline
[ 0 ] == b
"#": return ( b
"", -1 , -1 ) # comment line is skipped
1406 thisfields
= thisline
.split
( b
'\t' ) # I hope it will never bring me more trouble
1407 return len( thisfields
[ 4 ] )
1409 cdef tuple __fw_parse_line
(self, bytes thisline
):
1411 The following definition comes from bowtie website:
1413 The bowtie aligner outputs each alignment on a separate
1414 line. Each line is a collection of 8 fields separated by tabs;
1415 from left to right, the fields are:
1417 1. Name of read that aligned
1419 2. Orientation of read in the alignment, - for reverse
1420 complement, + otherwise
1422 3. Name of reference sequence where alignment occurs, or
1423 ordinal ID if no name was provided
1425 4. 0-based offset into the forward reference strand where
1426 leftmost character of the alignment occurs
1428 5. Read sequence (reverse-complemented if orientation is -)
1430 6. ASCII-encoded read qualities (reversed if orientation is
1431 -). The encoded quality values are on the Phred scale and the
1432 encoding is ASCII-offset by 33 (ASCII char !).
1434 7. Number of other instances where the same read aligns
1435 against the same reference characters as were aligned against
1436 in this alignment. This is not the number of other places the
1437 read aligns with the same number of mismatches. The number in
1438 this column is generally not a good proxy for that number
1439 (e.g., the number in this column may be '0' while the number
1440 of other alignments with the same number of mismatches might
1441 be large). This column was previously described as 'Reserved'.
1443 8. Comma-separated list of mismatch descriptors. If there are
1444 no mismatches in the alignment, this field is empty. A single
1445 descriptor has the format offset:reference-base>read-base. The
1446 offset is expressed as a 0-based offset from the high-quality
1447 (5') end of the read.
1454 thisline
= thisline
.rstrip
()
1455 if not thisline
: return ( b
"", -1, -1 )
1456 if thisline
[ 0 ] == b
"#": return ( b
"", -1, -1 ) # comment line is skipped
1457 thisfields
= thisline
.split
( b
'\t' ) # I hope it will never bring me more trouble
1459 chromname
= thisfields
[ 2 ]
1461 chromname
= chromname
[ :chromname
.rindex
( b
".fa" ) ]
1465 if thisfields
[ 1 ] == b
"+":
1467 atoi
( thisfields
[ 3 ] ),
1469 elif thisfields
[ 1 ] == b
"-":
1471 atoi
( thisfields
[ 3 ] ) + strlen
( thisfields
[ 4 ] ),
1474 raise StrandFormatError
( thisline
, thisfields
[ 1 ] )