simplify output msgs of callvar
[MACS.git] / MACS3 / IO / Parser.pyx
blob62c3f846b6b80f0900d2933f6709fd4b44909df2
1 # cython: language_level=3
2 # cython: profile=True
3 # cython: linetrace=True
4 # Time-stamp: <2020-12-06 23:32:28 Tao Liu>
6 """Module for all MACS Parser classes for input.
8 This code is free software; you can redistribute it and/or modify it
9 under the terms of the BSD License (see the file LICENSE included with
10 the distribution).
11 """
13 # ------------------------------------
14 # python modules
15 # ------------------------------------
16 import logging
17 from logging import info, debug
18 import struct
19 from struct import unpack
20 from re import findall
21 import gzip
22 import io
23 import sys
25 # ------------------------------------
26 # MACS3 modules
27 # ------------------------------------
29 from MACS3.Utilities.Constants import *
30 from MACS3.Signal.FixWidthTrack import FWTrack
31 from MACS3.Signal.PairedEndTrack import PETrackI
33 # ------------------------------------
34 # Other modules
35 # ------------------------------------
36 from cpython cimport bool
38 import numpy as np
39 cimport numpy as np
40 from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t
42 is_le = sys.byteorder == "little"
44 cdef extern from "stdlib.h":
45 ctypedef uint32_t size_t
46 size_t strlen(char *s)
47 void *malloc(size_t size)
48 void *calloc(size_t n, size_t size)
49 void free(void *ptr)
50 int32_t strcmp(char *a, char *b)
51 char * strcpy(char *a, char *b)
52 int64_t atol(char *str)
53 int32_t atoi(char *str)
55 # ------------------------------------
56 # constants
57 # ------------------------------------
58 __version__ = "Parser $Revision$"
59 __author__ = "Tao Liu <taoliu@jimmy.harvard.edu>"
60 __doc__ = "All Parser classes"
62 # ------------------------------------
63 # Misc functions
64 # ------------------------------------
66 cpdef guess_parser ( fname, int64_t buffer_size = 100000 ):
67 # Note: BAMPE and BEDPE can't be automatically detected.
68 ordered_parser_dict = {"BAM":BAMParser,
69 "BED":BEDParser,
70 "ELAND":ELANDResultParser,
71 "ELANDMULTI":ELANDMultiParser,
72 "ELANDEXPORT":ELANDExportParser,
73 "SAM":SAMParser,
74 "BOWTIE":BowtieParser}
76 for f in ordered_parser_dict:
77 p = ordered_parser_dict[ f ]
78 t_parser = p( fname, buffer_size = buffer_size )
79 debug( "Testing format %s" % f )
80 s = t_parser.sniff()
81 if s:
82 info( "Detected format is: %s" % ( f ) )
83 if t_parser.is_gzipped():
84 info( "* Input file is gzipped." )
85 return t_parser
86 else:
87 t_parser.close()
88 raise Exception( "Can't detect format!" )
90 cdef tuple __bam_fw_binary_parse_le ( const unsigned char * data ):
91 """Parse a BAM SE entry in little endian system
92 """
93 cdef:
94 int32_t thisref, thisstart, thisstrand
95 uint16_t bwflag
96 uint8_t l_read_name
97 uint16_t n_cigar_op
98 int32_t cigar_code
99 uint8_t *ui8
100 int32_t *i32
101 uint16_t *ui16
102 uint32_t *ui32
104 # we skip lot of the available information in data (i.e. tag name, quality etc etc)
105 # no data, return, does it really happen without raising struct.error?
106 if not data: return ( -1, -1, -1 )
108 ui16 = <uint16_t *>data
109 bwflag = ui16[7]
111 # we filter out unmapped sequence or bad sequence or secondary or supplementary alignment
112 # we filter out 2nd mate, not a proper pair, mate is unmapped
113 if (bwflag & 2820) or (bwflag & 1 and (bwflag & 136 or not bwflag & 2)): return ( -1, -1, -1 )
115 i32 = <int32_t *>data
116 thisref = i32[0]
117 thisstart = i32[1]
118 n_cigar_op = ui16[6]
119 # In case of paired-end we have now skipped all possible "bad" pairs
120 # in case of proper pair we have skipped the rightmost one... if the leftmost pair comes
121 # we can treat it as a single read, so just check the strand and calculate its
122 # start position... hope I'm right!
123 if bwflag & 16:
124 # read mapped to minus strand; then we have to compute cigar to find the rightmost position
125 ui8 = <uint8_t *>data
126 l_read_name = ui8[8]
127 # need to decipher CIGAR string
128 ui32 = <uint32_t *>(data + 32 + l_read_name) # move pointer to cigar_code
129 for cigar_code in ui32[:n_cigar_op]:#unpack( '<%dI' % (n_cigar_op) , data[ 32 + l_read_name : 32 + l_read_name + n_cigar_op*4 ] ):
130 if cigar_code & 15 in [ 0, 2, 3, 7, 8 ]: # they are CIGAR op M/D/N/=/X
131 thisstart += cigar_code >> 4
132 thisstrand = 1
133 else:
134 thisstrand = 0
136 return ( thisref, thisstart, thisstrand )
138 cdef tuple __bam_fw_binary_parse_be ( const unsigned char * data ):
139 """Big endian version. We need byte swap.
141 cdef:
142 int32_t thisref, thisstart, thisstrand
143 uint16_t bwflag
144 uint8_t l_read_name
145 uint16_t n_cigar_op
146 int32_t cigar_code
147 uint8_t *ui8 # we will only cast 1 byte at a time
148 int32_t i
149 uint32_t shift0, shift
152 # we skip lot of the available information in data (i.e. tag name, quality etc etc)
153 # no data, return, does it really happen without raising struct.error?
154 if not data: return ( -1, -1, -1 )
156 ui8 = <uint8_t *>data
157 bwflag = ui8[15] << 8 | ui8[14] # it works as bwflag = ui16[7] in little-endian
159 # we filter out unmapped sequence or bad sequence or secondary or supplementary alignment
160 # we filter out 2nd mate, not a proper pair, mate is unmapped
161 if (bwflag & 2820) or (bwflag & 1 and (bwflag & 136 or not bwflag & 2)): return ( -1, -1, -1 )
163 # the following three lins are for little-endian
164 #thisref = i32[0]
165 #thisstart = i32[1]
166 #n_cigar_op = ui16[6]
168 # to simplify the byte swap, we pretend all original numbers (thisref, pos, nextpos) positive
169 thisref = ui8[3] << 24 | ui8[2] << 16 | ui8[1] << 8 | ui8[0]
170 thisstart = ui8[7] << 24 | ui8[6] << 16 | ui8[5] << 8 | ui8[4]
171 n_cigar_op = ui8[13] << 8 | i8[12]
173 # In case of paired-end we have now skipped all possible "bad" pairs
174 # in case of proper pair we have skipped the rightmost one... if the leftmost pair comes
175 # we can treat it as a single read, so just check the strand and calculate its
176 # start position... hope I'm right!
177 if bwflag & 16:
178 # read mapped to minus strand; then we have to compute cigar to find the rightmost position
179 l_read_name = ui8[8]
180 # need to decipher CIGAR string
181 # move pointer to cigar_code
182 shift0 = 32 + l_read_name
183 for i in range(n_cigar_op):
184 shift = shift0 + i*4 # move 32bit at a time
185 cigar_code = ui8[shift0+3] << 24 | ui8[shift0+2] << 16 | ui8[shift0+1] << 8 | ui8[shift0] # it works like cigar_code = ui32[...] in little-endian
186 if cigar_code & 15 in [ 0, 2, 3, 7, 8 ]: # they are CIGAR op M/D/N/=/X
187 thisstart += cigar_code >> 4
188 thisstrand = 1
189 else:
190 thisstrand = 0
192 return ( thisref, thisstart, thisstrand )
194 cdef tuple __bampe_pe_binary_parse_le (const unsigned char * data):
195 """Parse a BAMPE record in little-endian system.
197 cdef:
198 int32_t thisref, thisstart, thistlen
199 int32_t nextpos, pos
200 uint16_t bwflag
201 uint8_t *ui8
202 int32_t *i32
203 uint16_t *ui16
205 # we skip lot of the available information in data (i.e. tag name, quality etc etc)
206 if not data:
207 return ( -1, -1, -1 )
209 ui16 = <uint16_t *>data
210 bwflag = ui16[7]
211 # we filter out unmapped, bad sequence, secondary/supplementary alignment
212 # we filter out other mate of paired reads, not a proper pair, or mate is unmapped
213 if (bwflag & 2820) or (bwflag & 1 and (bwflag & 136 or not bwflag & 2)):
214 return ( -1, -1, -1 )
216 i32 = <int32_t *>data
217 ui8 = <uint8_t *>data
218 thisref = i32[0]
220 pos = i32[1]
221 nextpos = i32[6]
222 thistlen = i32[7]
223 thisstart = min(pos, nextpos) # we keep only the leftmost
224 # position which means this must
225 # be at + strand. So we don't
226 # need to decipher CIGAR string.
227 thistlen = abs( thistlen ) # Actually, if
228 # # the value
229 # # unpacked is
230 # # negative, then
231 # # nextpos is the
232 # # leftmost
233 # # position.
235 return ( thisref, thisstart, thistlen )
237 cdef tuple __bampe_pe_binary_parse_be (const unsigned char * data):
238 """Parse a BAMPE record in big-endian system. And we need byte swap.
240 cdef:
241 int32_t thisref, thisstart, thistlen
242 uint32_t tmp_thistlen
243 int32_t nextpos, pos
244 uint16_t bwflag
245 uint8_t *ui8 # we will only cast 1 byte at a time
247 # we skip lot of the available information in data (i.e. tag name, quality etc etc)
248 if not data:
249 return ( -1, -1, -1 )
251 ui8 = <uint8_t *>data
253 bwflag = ui8[15] << 8 | ui8[14] # as le: bwflag = ui16[7]
254 # we filter out unmapped, bad sequence, secondary/supplementary alignment
255 # we filter out other mate of paired reads, not a proper pair, or mate is unmapped
256 if (bwflag & 2820) or (bwflag & 1 and (bwflag & 136 or not bwflag & 2)):
257 return ( -1, -1, -1 )
259 i8 = <int8_t *>data
261 # the following three lins are for little-endian
265 # to simplify the byte swap, we pretend all original numbers (thisref, pos, nextpos) positive
266 thisref = ui8[3] << 24 | ui8[2] << 16 | ui8[1] << 8 | ui8[0] # as le:thisref = i32[0]
267 pos = ui8[7] << 24 | ui8[6] << 16 | ui8[5] << 8 | ui8[4] # as le:pos = i32[1]
268 nextpos = ui8[27] << 24 | ui8[26] << 16 | ui8[25] << 8 | ui8[24] # as le:nextpos = i32[6]
270 # thistlen can be negative, so we byte swap it then convert to int32_t then take abs (maybe there is more effecient way?)
271 tmp_thistlen = ui8[31] << 24 | ui8[30] << 16 | ui8[29] << 8 | ui8[28] # as le:tmp_thistlen = ui32[7]
272 thistlen = abs(<int32_t> tmp_thistlen)
274 # position which means this must
275 # be at + strand. So we don't
276 # need to decipher CIGAR string.
277 thisstart = pos if nextpos > pos else nextpos #min(pos, nextpos) # we keep only the leftmost
279 return ( thisref, thisstart, thistlen )
282 # choose a parser according to endian
283 if is_le:
284 bam_se_entry_parser = __bam_fw_binary_parse_le
285 bampe_pe_entry_parser = __bampe_pe_binary_parse_le
286 else:
287 bam_se_entry_parser = __bam_fw_binary_parse_be
288 bampe_pe_entry_parser = __bampe_pe_binary_parse_be
290 # ------------------------------------
291 # Classes
292 # ------------------------------------
293 class StrandFormatError( Exception ):
294 """Exception about strand format error.
296 Example:
297 raise StrandFormatError('Must be F or R','X')
299 def __init__ ( self, string, strand ):
300 self.strand = strand
301 self.string = string
303 def __str__ ( self ):
304 return repr( "Strand information can not be recognized in this line: \"%s\",\"%s\"" % ( self.string, self.strand ) )
306 cdef class GenericParser:
307 """Generic Parser class.
309 Inherit this class to write your own parser. In most cases, you need to override:
311 1. __tlen_parse_line which returns tag length of a line
312 2. __fw_parse_line which returns tuple of ( chromosome, 5'position, strand )
314 cdef:
315 str filename
316 bool gzipped
317 int32_t tag_size
318 object fhd
319 int64_t buffer_size
321 def __init__ ( self, str filename, int64_t buffer_size = 100000 ):
322 """Open input file. Determine whether it's a gzipped file.
324 'filename' must be a string object.
326 This function initialize the following attributes:
328 1. self.filename: the filename for input file.
329 2. self.gzipped: a boolean indicating whether input file is gzipped.
330 3. self.fhd: buffered I/O stream of input file
332 self.filename = filename
333 self.gzipped = True
334 self.tag_size = -1
335 self.buffer_size = buffer_size
336 # try gzip first
337 f = gzip.open( filename )
338 try:
339 f.read( 10 )
340 except IOError:
341 # not a gzipped file
342 self.gzipped = False
343 f.close()
344 if self.gzipped:
345 # open with gzip.open, then wrap it with BufferedReader!
346 self.fhd = io.BufferedReader( gzip.open( filename, mode='rb' ), buffer_size = READ_BUFFER_SIZE ) # buffersize set to 10M
347 else:
348 self.fhd = io.open( filename, mode='rb' ) # binary mode! I don't expect unicode here!
349 self.__skip_first_commentlines()
351 cdef void __skip_first_commentlines ( self ):
352 """Some parser needs to skip the first several comment lines.
354 Redefine this if it's necessary!
356 return
358 cpdef int32_t tsize( self ):
359 """General function to detect tag size.
361 * Although it can be used by most parsers, it must be
362 rewritten by BAMParser!
364 cdef:
365 int32_t s = 0
366 int32_t n = 0 # number of successful/valid read alignments
367 int32_t m = 0 # number of trials
368 int32_t this_taglength
369 bytes thisline
371 if self.tag_size != -1:
372 # if we have already calculated tag size (!= -1), return it.
373 return self.tag_size
375 # try 10k times or retrieve 10 successfule alignments
376 while n < 10 and m < 10000:
377 m += 1
378 thisline = self.fhd.readline()
379 this_taglength = self.__tlen_parse_line( thisline )
380 if this_taglength > 0:
381 # this_taglength == 0 means this line doesn't contain
382 # successful alignment.
383 s += this_taglength
384 n += 1
385 # done
386 self.fhd.seek( 0 )
387 self.__skip_first_commentlines()
388 if n != 0: # else tsize = -1
389 self.tag_size = <int32_t>(s/n)
390 return self.tag_size
392 cdef int32_t __tlen_parse_line ( self, bytes thisline ):
393 """Abstract function to detect tag length.
396 raise NotImplemented
398 cpdef build_fwtrack ( self ):
399 """Generic function to build FWTrack object. Create a new
400 FWTrack object. If you want to append new records to an
401 existing FWTrack object, try append_fwtrack function.
403 * BAMParser for binary BAM format should have a different one.
405 cdef:
406 int64_t i, fpos, strand
407 bytes chromosome
408 bytes tmp = b""
410 fwtrack = FWTrack( buffer_size = self.buffer_size )
411 i = 0
412 while True:
413 # for each block of input
414 tmp += self.fhd.read( READ_BUFFER_SIZE )
415 if not tmp:
416 break
417 lines = tmp.split(b"\n")
418 tmp = lines[ -1 ]
419 for thisline in lines[ :-1 ]:
420 ( chromosome, fpos, strand ) = self.__fw_parse_line( thisline )
421 if fpos < 0 or not chromosome:
422 # normally __fw_parse_line will return -1 if the line
423 # contains no successful alignment.
424 continue
425 i += 1
426 if i % 1000000 == 0:
427 info( " %d reads parsed" % i )
428 fwtrack.add_loc( chromosome, fpos, strand )
429 # last one
430 if tmp:
431 ( chromosome, fpos, strand ) = self.__fw_parse_line( tmp )
432 if fpos >= 0 and chromosome:
433 i += 1
434 fwtrack.add_loc( chromosome, fpos, strand )
435 # close file stream.
436 self.close()
437 return fwtrack
439 cpdef append_fwtrack ( self, fwtrack ):
440 """Add more records to an existing FWTrack object.
443 cdef:
444 int64_t i, fpos, strand
445 bytes chromosome
446 bytes tmp = b""
447 i = 0
448 while True:
449 # for each block of input
450 tmp += self.fhd.read( READ_BUFFER_SIZE )
451 if not tmp:
452 break
453 lines = tmp.split(b"\n")
454 tmp = lines[ -1 ]
455 for thisline in lines[ :-1 ]:
456 ( chromosome, fpos, strand ) = self.__fw_parse_line( thisline )
457 if fpos < 0 or not chromosome:
458 # normally __fw_parse_line will return -1 if the line
459 # contains no successful alignment.
460 continue
461 i += 1
462 if i % 1000000 == 0:
463 info( " %d reads parsed" % i )
464 fwtrack.add_loc( chromosome, fpos, strand )
466 # last one
467 if tmp:
468 ( chromosome, fpos, strand ) = self.__fw_parse_line( tmp )
469 if fpos >= 0 and chromosome:
470 i += 1
471 fwtrack.add_loc( chromosome, fpos, strand )
472 # close file stream.
473 self.close()
474 return fwtrack
476 cdef tuple __fw_parse_line ( self, bytes thisline ):
477 """Abstract function to parse chromosome, 5' end position and
478 strand.
481 cdef bytes chromosome = b""
482 cdef int32_t fpos = -1
483 cdef int32_t strand = -1
484 return ( chromosome, fpos, strand )
486 cpdef sniff ( self ):
487 """Detect whether this parser is the correct parser for input
488 file.
490 Rule: try to find the tag size using this parser, if error
491 occurs or tag size is too small or too big, check is failed.
493 * BAMParser has a different sniff function.
495 cdef int32_t t
497 t = self.tsize()
498 if t <= 10 or t >= 10000: # tsize too small or too big
499 self.fhd.seek( 0 )
500 return False
501 else:
502 self.fhd.seek( 0 )
503 self.__skip_first_commentlines()
504 return True
506 cpdef close ( self ):
507 """Run this when this Parser will be never used.
509 Close file I/O stream.
511 self.fhd.close()
513 cpdef bool is_gzipped ( self ):
514 return self.gzipped
516 cdef class BEDParser( GenericParser ):
517 """File Parser Class for BED File.
520 cdef void __skip_first_commentlines ( self ):
521 """BEDParser needs to skip the first several comment lines.
523 cdef:
524 int32_t l_line
525 bytes this_line
526 for thisline in self.fhd:
527 l_line = len( thisline )
528 if thisline and ( thisline[ :5 ] != b"track" ) \
529 and ( thisline[ :7 ] != b"browser" ) \
530 and ( thisline[ 0 ] != 35 ): # 35 is b"#"
531 break
533 # rewind from SEEK_CUR
534 self.fhd.seek( -l_line, 1 )
535 return
537 cdef int32_t __tlen_parse_line ( self, bytes thisline ):
538 """Parse 5' and 3' position, then calculate frag length.
541 thisline = thisline.rstrip()
542 if not thisline:
543 return 0
545 thisfields = thisline.split( b'\t' )
546 return atoi( thisfields[ 2 ] )-atoi( thisfields[ 1 ] )
548 cdef tuple __fw_parse_line ( self, bytes thisline ):
549 #cdef list thisfields
550 cdef:
551 bytes chromname
552 list thisfields
554 thisline = thisline.rstrip()
555 thisfields = thisline.split( b'\t' )
556 chromname = thisfields[ 0 ]
557 try:
558 #if not strcmp(thisfields[ 5 ],b"+"):
559 if thisfields[5] == b"+":
560 return ( chromname,
561 atoi( thisfields[ 1 ] ),
563 #elif not strcmp(thisfields[ 5 ], b"-"):
564 elif thisfields[5] == b"-":
565 return ( chromname,
566 atoi( thisfields[ 2 ] ),
568 else:
569 raise StrandFormatError( thisline, thisfields[ 5 ] )
570 except IndexError:
571 # default pos strand if no strand
572 # info can be found
573 return ( chromname,
574 atoi( thisfields[ 1 ] ),
577 cdef class BEDPEParser(GenericParser):
578 """Parser for BED format file containing PE information, and also
579 can be used for the cases when users predefine the fragment
580 locations by shifting/extending by themselves.
582 Format:
584 chromosome_name frag_leftend frag_rightend
587 Note: Only the first columns are used!
590 cdef public int32_t n
591 cdef public float32_t d
593 cdef void __skip_first_commentlines ( self ):
594 """BEDPEParser needs to skip the first several comment lines.
596 cdef:
597 int32_t l_line
598 bytes this_line
599 for thisline in self.fhd:
600 l_line = len( thisline )
601 if thisline and ( thisline[ :5 ] != b"track" ) \
602 and ( thisline[ :7 ] != b"browser" ) \
603 and ( thisline[ 0 ] != 35 ): # 35 is b"#"
604 break
606 # rewind from SEEK_CUR
607 self.fhd.seek( -l_line, 1 )
608 return
610 cdef __pe_parse_line ( self, bytes thisline ):
611 """ Parse each line, and return chromosome, left and right positions
614 cdef:
615 list thisfields
617 thisline = thisline.rstrip()
619 # still only support tabular as delimiter.
620 thisfields = thisline.split( b'\t' )
621 try:
622 return ( thisfields[ 0 ],
623 atoi( thisfields[ 1 ] ),
624 atoi( thisfields[ 2 ] ) )
625 except IndexError:
626 raise Exception("Less than 3 columns found at this line: %s\n" % thisline)
628 cpdef build_petrack ( self ):
629 """Build PETrackI from all lines.
632 cdef:
633 bytes chromname
634 int32_t left_pos
635 int32_t right_pos
636 int64_t i = 0 # number of fragments
637 int64_t m = 0 # sum of fragment lengths
638 bytes tmp = b""
640 petrack = PETrackI( buffer_size = self.buffer_size )
641 add_loc = petrack.add_loc
643 while True:
644 # for each block of input
645 tmp += self.fhd.read( READ_BUFFER_SIZE )
646 if not tmp:
647 break
648 lines = tmp.split(b"\n")
649 tmp = lines[ -1 ]
650 for thisline in lines[ :-1 ]:
651 ( chromosome, left_pos, right_pos ) = self.__pe_parse_line( thisline )
652 if left_pos < 0 or not chromosome:
653 continue
654 assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
655 m += right_pos - left_pos
656 i += 1
657 if i % 1000000 == 0:
658 info( " %d fragments parsed" % i )
659 add_loc( chromosome, left_pos, right_pos )
660 # last one
661 if tmp:
662 ( chromosome, left_pos, right_pos ) = self.__pe_parse_line( thisline )
663 if left_pos >= 0 and chromosome:
664 assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
665 i += 1
666 m += right_pos - left_pos
667 add_loc( chromosome, left_pos, right_pos )
669 self.d = <float32_t>( m ) / i
670 self.n = i
671 assert self.d >= 0, "Something went wrong (mean fragment size was negative)"
673 self.close()
674 petrack.set_rlengths( {"DUMMYCHROM":0} )
675 return petrack
677 cpdef append_petrack (self, petrack):
678 """Build PETrackI from all lines, return a PETrackI object.
680 cdef:
681 bytes chromname
682 int32_t left_pos
683 int32_t right_pos
684 int64_t i = 0 # number of fragments
685 int64_t m = 0 # sum of fragment lengths
686 bytes tmp = b""
688 add_loc = petrack.add_loc
689 while True:
690 # for each block of input
691 tmp += self.fhd.read( READ_BUFFER_SIZE )
692 if not tmp:
693 break
694 lines = tmp.split(b"\n")
695 tmp = lines[ -1 ]
696 for thisline in lines[ :-1 ]:
697 ( chromosome, left_pos, right_pos ) = self.__pe_parse_line( thisline )
698 if left_pos < 0 or not chromosome:
699 continue
700 assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
701 m += right_pos - left_pos
702 i += 1
703 if i % 1000000 == 0:
704 info( " %d fragments parsed" % i )
705 add_loc( chromosome, left_pos, right_pos )
706 # last one
707 if tmp:
708 ( chromosome, left_pos, right_pos ) = self.__pe_parse_line( thisline )
709 if left_pos >= 0 and chromosome:
710 assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
711 i += 1
712 m += right_pos - left_pos
713 add_loc( chromosome, left_pos, right_pos )
715 self.d = ( self.d * self.n + m ) / ( self.n + i )
716 self.n += i
718 assert self.d >= 0, "Something went wrong (mean fragment size was negative)"
719 self.close()
720 petrack.set_rlengths( {"DUMMYCHROM":0} )
721 return petrack
723 cdef class ELANDResultParser( GenericParser ):
724 """File Parser Class for tabular File.
727 cdef void __skip_first_commentlines ( self ):
728 """ELANDResultParser needs to skip the first several comment lines.
730 cdef:
731 int32_t l_line
732 bytes this_line
733 for thisline in self.fhd:
734 l_line = len( thisline )
735 if thisline and thisline[ 0 ] != 35: # 35 is b"#"
736 break
738 # rewind from SEEK_CUR
739 self.fhd.seek( -l_line, 1 )
740 return
742 cdef int32_t __tlen_parse_line ( self, bytes thisline ):
743 """Parse tag sequence, then tag length.
746 thisline = thisline.rstrip()
747 if not thisline: return 0
748 thisfields = thisline.split( b'\t' )
749 if thisfields[1].isdigit():
750 return 0
751 else:
752 return len( thisfields[ 1 ] )
754 cdef tuple __fw_parse_line ( self, bytes thisline ):
755 cdef:
756 bytes chromname, strand
757 int32_t thistaglength
758 thisline = thisline.rstrip()
759 if not thisline: return ( b"", -1, -1 )
761 thisfields = thisline.split( b'\t' )
762 thistaglength = strlen( thisfields[ 1 ] )
764 if len( thisfields ) <= 6:
765 return ( b"", -1, -1 )
767 try:
768 chromname = thisfields[ 6 ]
769 chromname = chromname[ :chromname.rindex( b".fa" ) ]
770 except ValueError:
771 pass
773 if thisfields[ 2 ] == b"U0" or thisfields[ 2 ] == b"U1" or thisfields[ 2 ] == b"U2":
774 # allow up to 2 mismatches...
775 strand = thisfields[ 8 ]
776 if strand == b"F":
777 return ( chromname,
778 atoi( thisfields[ 7 ] ) - 1,
780 elif strand == b"R":
781 return ( chromname,
782 atoi( thisfields[ 7 ] ) + thistaglength - 1,
784 else:
785 raise StrandFormatError( thisline, strand )
786 else:
787 return ( b"", -1, -1 )
789 cdef class ELANDMultiParser( GenericParser ):
790 """File Parser Class for ELAND multi File.
792 Note this parser can only work for s_N_eland_multi.txt format.
794 Each line of the output file contains the following fields:
795 1. Sequence name
796 2. Sequence
797 3. Either NM, QC, RM (as described above) or the following:
798 4. x:y:z where x, y, and z are the number of exact, single-error, and 2-error matches
799 found
800 5. Blank, if no matches found or if too many matches found, or the following:
801 BAC_plus_vector.fa:163022R1,170128F2,E_coli.fa:3909847R1
802 This says there are two matches to BAC_plus_vector.fa: one in the reverse direction
803 starting at position 160322 with one error, one in the forward direction starting at
804 position 170128 with two errors. There is also a single-error match to E_coli.fa.
806 cdef void __skip_first_commentlines ( self ):
807 """ELANDResultParser needs to skip the first several comment lines.
809 cdef:
810 int32_t l_line
811 bytes this_line
812 for thisline in self.fhd:
813 l_line = len( thisline )
814 if thisline and thisline[ 0 ] != 35: # 35 is b"#"
815 break
817 # rewind from SEEK_CUR
818 self.fhd.seek( -l_line, 1 )
819 return
821 cdef int32_t __tlen_parse_line ( self, bytes thisline ):
822 """Parse tag sequence, then tag length.
825 thisline = thisline.rstrip()
826 if not thisline: return 0
827 thisfields = thisline.split( b'\t' )
828 if thisfields[1].isdigit():
829 return 0
830 else:
831 return len( thisfields[ 1 ] )
833 cdef tuple __fw_parse_line ( self, bytes thisline ):
834 cdef:
835 list thisfields
836 bytes thistagname, pos, strand
837 int32_t thistaglength, thistaghits
839 if not thisline: return ( b"", -1, -1 )
840 thisline = thisline.rstrip()
841 if not thisline: return ( b"", -1, -1 )
843 thisfields = thisline.split( b'\t' )
844 thistagname = thisfields[ 0 ] # name of tag
845 thistaglength = len( thisfields[ 1 ] ) # length of tag
847 if len( thisfields ) < 4:
848 return ( b"", -1, -1 )
849 else:
850 thistaghits = sum( [<int32_t>(x) for x in thisfields[ 2 ].split( b':' ) ] )
851 if thistaghits > 1:
852 # multiple hits
853 return ( b"", -1, -1 )
854 else:
855 ( chromname, pos ) = thisfields[ 3 ].split( b':' )
857 try:
858 chromname = chromname[ :chromname.rindex( b".fa" ) ]
859 except ValueError:
860 pass
862 strand = pos[ -2 ]
863 if strand == b"F":
864 return ( chromname,
865 <int32_t>( pos[ :-2 ] )-1,
867 elif strand == b"R":
868 return ( chromname,
869 <int32_t>( pos[ :-2 ] ) + thistaglength - 1,
871 else:
872 raise StrandFormatError( thisline,strand )
875 cdef class ELANDExportParser( GenericParser ):
876 """File Parser Class for ELAND Export File.
879 cdef void __skip_first_commentlines ( self ):
880 """ELANDResultParser needs to skip the first several comment lines.
882 cdef:
883 int32_t l_line
884 bytes this_line
885 for thisline in self.fhd:
886 l_line = len( thisline )
887 if thisline and thisline[ 0 ] != 35: # 35 is b"#"
888 break
890 # rewind from SEEK_CUR
891 self.fhd.seek( -l_line, 1 )
892 return
894 cdef int32_t __tlen_parse_line ( self, bytes thisline ):
895 """Parse tag sequence, then tag length.
898 thisline = thisline.rstrip()
899 if not thisline: return 0
900 thisfields = thisline.split( b'\t' )
901 if len( thisfields ) > 12 and thisfields[ 12 ]:
902 # a successful alignment has over 12 columns
903 return len( thisfields[ 8 ] )
904 else:
905 return 0
907 cdef tuple __fw_parse_line ( self, bytes thisline ):
908 cdef:
909 list thisfields
910 bytes thisname, strand
911 int32_t thistaglength
913 thisline = thisline.rstrip()
914 if not thisline: return ( b"", -1, -1 )
916 thisfields = thisline.split( b"\t" )
918 if len(thisfields) > 12 and thisfields[ 12 ]:
919 thisname = b":".join( thisfields[ 0:6 ] )
920 thistaglength = len( thisfields[ 8 ] )
921 strand = thisfields[ 13 ]
922 if strand == b"F":
923 return ( thisfields[ 10 ], atoi( thisfields[ 12 ] ) - 1, 0 )
924 elif strand == b"R":
925 return ( thisfields[ 10 ], atoi( thisfields[ 12 ] ) + thistaglength - 1, 1 )
926 else:
927 raise StrandFormatError( thisline, strand )
928 else:
929 return ( b"", -1, -1 )
931 ### Contributed by Davide, modified by Tao
932 cdef class SAMParser( GenericParser ):
933 """File Parser Class for SAM File.
935 Each line of the output file contains at least:
936 1. Sequence name
937 2. Bitwise flag
938 3. Reference name
939 4. 1-based leftmost position fo clipped alignment
940 5. Mapping quality
941 6. CIGAR string
942 7. Mate Reference Name
943 8. 1-based leftmost Mate Position
944 9. Inferred insert size
945 10. Query sequence on the same strand as the reference
946 11. Query quality
948 The bitwise flag is made like this:
949 dec meaning
950 --- -------
951 1 paired read
952 2 proper pair
953 4 query unmapped
954 8 mate unmapped
955 16 strand of the query (1 -> reverse)
956 32 strand of the mate
957 64 first read in pair
958 128 second read in pair
959 256 alignment is not primary
960 512 does not pass quality check
961 1024 PCR or optical duplicate
962 2048 supplementary alignment
965 cdef void __skip_first_commentlines ( self ):
966 """SAMParser needs to skip the first several comment lines.
968 cdef:
969 int32_t l_line
970 bytes this_line
971 for thisline in self.fhd:
972 l_line = len( thisline )
973 if thisline and thisline[ 0 ] != 64: # 64 is b"@"
974 break
976 # rewind from SEEK_CUR
977 self.fhd.seek( -l_line, 1 )
978 return
980 cdef int32_t __tlen_parse_line ( self, bytes thisline ):
981 """Parse tag sequence, then tag length.
984 cdef:
985 list thisfields
986 int32_t bwflag
988 thisline = thisline.rstrip()
989 if not thisline: return 0
990 thisfields = thisline.split( b'\t' )
991 bwflag = atoi( thisfields[ 1 ] )
992 if bwflag & 4 or bwflag & 512 or bwflag & 256 or bwflag & 2048:
993 return 0 #unmapped sequence or bad sequence or 2nd or sup alignment
994 if bwflag & 1:
995 # paired read. We should only keep sequence if the mate is mapped
996 # and if this is the left mate, all is within the flag!
997 if not bwflag & 2:
998 return 0 # not a proper pair
999 if bwflag & 8:
1000 return 0 # the mate is unmapped
1001 # From Benjamin Schiller https://github.com/benjschiller
1002 if bwflag & 128:
1003 # this is not the first read in a pair
1004 return 0
1005 return len( thisfields[ 9 ] )
1007 cdef tuple __fw_parse_line ( self, bytes thisline ):
1008 cdef:
1009 list thisfields
1010 bytes thistagname, thisref
1011 int32_t bwflag, thisstrand, thisstart
1013 thisline = thisline.rstrip()
1014 if not thisline: return ( b"", -1, -1 )
1015 thisfields = thisline.split( b'\t' )
1016 thistagname = thisfields[ 0 ] # name of tag
1017 thisref = thisfields[ 2 ]
1018 bwflag = atoi( thisfields[ 1 ] )
1019 CIGAR = thisfields[ 5 ]
1021 if (bwflag & 2820) or (bwflag & 1 and (bwflag & 136 or not bwflag & 2)): return ( b"", -1, -1 )
1023 #if bwflag & 4 or bwflag & 512 or bwflag & 256 or bwflag & 2048:
1024 # return ( b"", -1, -1 ) #unmapped sequence or bad sequence or 2nd or sup alignment
1025 #if bwflag & 1:
1026 # # paired read. We should only keep sequence if the mate is mapped
1027 # # and if this is the left mate, all is within the flag!
1028 # if not bwflag & 2:
1029 # return ( b"", -1, -1 ) # not a proper pair
1030 # if bwflag & 8:
1031 # return ( b"", -1, -1 ) # the mate is unmapped
1032 # # From Benjamin Schiller https://github.com/benjschiller
1033 # if bwflag & 128:
1034 # # this is not the first read in a pair
1035 # return ( b"", -1, -1 )
1036 # # end of the patch
1037 # In case of paired-end we have now skipped all possible "bad" pairs
1038 # in case of proper pair we have skipped the rightmost one... if the leftmost pair comes
1039 # we can treat it as a single read, so just check the strand and calculate its
1040 # start position... hope I'm right!
1041 if bwflag & 16:
1042 # minus strand, we have to decipher CIGAR string
1043 thisstrand = 1
1044 thisstart = atoi( thisfields[ 3 ] ) - 1 + sum( [ <int32_t>(x) for x in findall(b"(\d+)[MDNX=]",CIGAR) ] ) #reverse strand should be shifted alen bp
1045 else:
1046 thisstrand = 0
1047 thisstart = atoi( thisfields[ 3 ] ) - 1
1049 try:
1050 thisref = thisref[ :thisref.rindex( b".fa" ) ]
1051 except ValueError:
1052 pass
1054 return ( thisref, thisstart, thisstrand )
1056 cdef class BAMParser( GenericParser ):
1057 """File Parser Class for BAM File.
1059 File is gzip-compatible and binary.
1060 Information available is the same that is in SAM format.
1062 The bitwise flag is made like this:
1063 dec meaning
1064 --- -------
1065 1 paired read
1066 2 proper pair
1067 4 query unmapped
1068 8 mate unmapped
1069 16 strand of the query (1 -> reverse)
1070 32 strand of the mate
1071 64 first read in pair
1072 128 second read in pair
1073 256 alignment is not primary
1074 512 does not pass quality check
1075 1024 PCR or optical duplicate
1076 2048 supplementary alignment
1078 def __init__ ( self, str filename, int64_t buffer_size = 100000 ):
1079 """Open input file. Determine whether it's a gzipped file.
1081 'filename' must be a string object.
1083 This function initialize the following attributes:
1085 1. self.filename: the filename for input file.
1086 2. self.gzipped: a boolean indicating whether input file is gzipped.
1087 3. self.fhd: buffered I/O stream of input file
1089 self.filename = filename
1090 self.gzipped = True
1091 self.tag_size = -1
1092 self.buffer_size = buffer_size
1093 # try gzip first
1094 f = gzip.open( filename )
1095 try:
1096 f.read( 10 )
1097 except IOError:
1098 # not a gzipped file
1099 self.gzipped = False
1100 f.close()
1101 if self.gzipped:
1102 # open with gzip.open, then wrap it with BufferedReader!
1103 self.fhd = io.BufferedReader( gzip.open( filename, mode='rb' ), buffer_size = READ_BUFFER_SIZE) # buffersize set to 1M
1104 else:
1105 self.fhd = io.open( filename, mode='rb' ) # binary mode! I don't expect unicode here!
1107 cpdef sniff( self ):
1108 """Check the first 3 bytes of BAM file. If it's 'BAM', check
1109 is success.
1112 magic_header = self.fhd.read( 3 )
1113 if magic_header == b"BAM":
1114 tsize = self.tsize()
1115 if tsize > 0:
1116 self.fhd.seek( 0 )
1117 return True
1118 else:
1119 self.fhd.seek( 0 )
1120 raise Exception( "File is not of a valid BAM format! %d" % tsize )
1121 else:
1122 self.fhd.seek( 0 )
1123 return False
1125 cpdef int32_t tsize ( self ):
1126 """Get tag size from BAM file -- read l_seq field.
1128 Refer to: http://samtools.sourceforge.net/SAM1.pdf
1130 * This may not work for BAM file from bedToBAM (bedtools),
1131 since the l_seq field seems to be 0.
1134 cdef:
1135 int32_t x, header_len, nc, nlength
1136 int32_t n = 0 # successful read of tag size
1137 float64_t s = 0 # sum of tag sizes
1139 if self.tag_size != -1:
1140 # if we have already calculated tag size (!= -1), return it.
1141 return self.tag_size
1143 fseek = self.fhd.seek
1144 fread = self.fhd.read
1145 ftell = self.fhd.tell
1146 # move to pos 4, there starts something
1147 fseek( 4 )
1148 header_len = unpack( '<i', fread( 4 ) )[ 0 ]
1149 fseek( header_len + ftell() )
1150 # get the number of chromosome
1151 nc = unpack( '<i', fread( 4 ) )[ 0 ]
1152 for x in range( nc ):
1153 # read each chromosome name
1154 nlength = unpack( '<i' , fread( 4 ) )[ 0 ]
1155 # jump over chromosome size, we don't need it
1156 fread( nlength )
1157 fseek( ftell() + 4 )
1158 while n < 10:
1159 entrylength = unpack( '<i', fread( 4 ) )[ 0 ]
1160 data = fread( entrylength )
1161 a = unpack( '<i', data[16:20] )[ 0 ]
1162 s += a
1163 n += 1
1164 fseek( 0 )
1165 self.tag_size = <int32_t>(s/n)
1166 return self.tag_size
1168 cpdef tuple get_references( self ):
1170 read in references from BAM header
1172 return a tuple (references (list of names),
1173 rlengths (dict of lengths)
1175 cdef:
1176 int32_t header_len, x, nc, nlength
1177 bytes refname
1178 list references = []
1179 dict rlengths = {}
1181 fseek = self.fhd.seek
1182 fread = self.fhd.read
1183 ftell = self.fhd.tell
1184 # move to pos 4, there starts something
1185 fseek(4)
1186 header_len = unpack( '<i', fread( 4 ) )[ 0 ]
1187 fseek( header_len + ftell() )
1188 # get the number of chromosome
1189 nc = unpack( '<i', fread( 4 ) )[ 0 ]
1190 for x in range( nc ):
1191 # read each chromosome name
1192 nlength = unpack( '<i', fread( 4 ) )[ 0 ]
1193 refname = fread( nlength )[ :-1 ]
1194 references.append( refname )
1195 # don't jump over chromosome size
1196 # we can use it to avoid falling of chrom ends during peak calling
1197 rlengths[refname] = unpack( '<i', fread( 4 ) )[ 0 ]
1198 return (references, rlengths)
1200 cpdef build_fwtrack ( self ):
1201 """Build FWTrack from all lines, return a FWTrack object.
1203 Note only the unique match for a tag is kept.
1205 cdef:
1206 int64_t i = 0 #number of reads kept
1207 int32_t entrylength, fpos, strand, chrid
1208 list references
1209 dict rlengths
1211 fwtrack = FWTrack( buffer_size = self.buffer_size )
1212 references, rlengths = self.get_references() # after this, ptr at list of alignments
1213 fseek = self.fhd.seek
1214 fread = self.fhd.read
1215 ftell = self.fhd.tell
1217 while True:
1218 try:
1219 entrylength = unpack( "<i", fread( 4 ) )[0]
1220 except struct.error:
1221 break
1222 ( chrid, fpos, strand ) = bam_se_entry_parser( fread( entrylength ) )
1223 if chrid == -1: continue
1224 fwtrack.add_loc( references[ chrid ], fpos, strand )
1225 i += 1
1226 if i % 1000000 == 0:
1227 info( " %d reads parsed" % i )
1229 #print( f"{references[chrid]:},{fpos:},{strand:}" )
1230 info( "%d reads have been read." % i )
1231 self.fhd.close()
1232 fwtrack.set_rlengths( rlengths )
1233 return fwtrack
1235 cpdef append_fwtrack ( self, fwtrack ):
1236 """Build FWTrack from all lines, return a FWTrack object.
1238 Note only the unique match for a tag is kept.
1240 cdef:
1241 int64_t i = 0 #number of reads kept
1242 int32_t entrylength, fpos, strand, chrid
1243 list references
1244 dict rlengths
1246 references, rlengths = self.get_references()
1247 fseek = self.fhd.seek
1248 fread = self.fhd.read
1249 ftell = self.fhd.tell
1251 while True:
1252 try:
1253 entrylength = unpack( '<i', fread( 4 ) )[ 0 ]
1254 except struct.error:
1255 break
1256 ( chrid, fpos, strand ) = bam_se_entry_parser( fread( entrylength ) )
1257 if chrid == -1: continue
1258 fwtrack.add_loc( references[ chrid ], fpos, strand )
1259 i += 1
1260 if i % 1000000 == 0:
1261 info( " %d reads parsed" % i )
1263 info( "%d reads have been read." % i )
1264 self.fhd.close()
1265 #fwtrack.finalize()
1266 # this is the problematic part. If fwtrack is finalized, then it's impossible to increase the length of it in a step of buffer_size for multiple input files.
1267 fwtrack.set_rlengths( rlengths )
1268 return fwtrack
1270 cdef class BAMPEParser(BAMParser):
1271 """File Parser Class for BAM File containing paired-end reads
1272 Only counts valid pairs, discards everything else
1273 Uses the midpoint of every read and calculates the average fragment size
1274 on the fly instead of modeling it
1276 File is gzip-compatible and binary.
1277 Information available is the same that is in SAM format.
1279 The bitwise flag is made like this:
1280 dec meaning
1281 --- -------
1282 1 paired read
1283 2 proper pair
1284 4 query unmapped
1285 8 mate unmapped
1286 16 strand of the query (1 -> reverse)
1287 32 strand of the mate
1288 64 first read in pair
1289 128 second read in pair
1290 256 alignment is not primary
1291 512 does not pass quality check
1292 1024 PCR or optical duplicate
1293 2048 supplementary alignment
1295 cdef public int32_t n # total number of fragments
1296 cdef public float32_t d # the average length of fragments
1298 cpdef object build_petrack ( self ):
1299 """Build PETrackI from all lines, return a FWTrack object.
1301 cdef:
1302 int64_t i # number of fragments kept
1303 int64_t m # sum of fragment lengths
1304 int32_t entrylength, fpos, chrid, tlen
1305 list references
1306 dict rlengths
1308 i = 0
1309 m = 0
1311 petrack = PETrackI( buffer_size = self.buffer_size )
1312 references, rlengths = self.get_references()
1313 fseek = self.fhd.seek
1314 fread = self.fhd.read
1315 ftell = self.fhd.tell
1317 # for convenience, only count valid pairs
1318 add_loc = petrack.add_loc
1319 err = struct.error
1321 while True:
1322 try:
1323 entrylength = unpack( '<i', fread(4) )[0]
1324 except err:
1325 #e1 += 1
1326 break
1327 ( chrid, fpos, tlen ) = bampe_pe_entry_parser( fread(entrylength) )
1328 if chrid == -1:
1329 #e2 += 1
1330 continue
1331 add_loc(references[ chrid ], fpos, fpos + tlen)
1332 m += tlen
1333 i += 1
1334 if i % 1000000 == 0:
1335 info( " %d fragments parsed" % i )
1337 #print( f"{references[chrid]:},{fpos:},{tlen:}" )
1338 info( "%d fragments have been read." % i )
1339 #debug( f" {e1} Can't identify the length of entry, it may be the end of file, stop looping..." )
1340 #debug( f" {e2} Chromosome name can't be found which means this entry is skipped ..." )
1341 #assert i > 0, "Something went wrong, no fragment has been read! Check input file!"
1342 self.d = m / i
1343 self.n = i
1344 #assert self.d >= 0, "Something went wrong (mean fragment size was negative: %d = %d / %d)" % (self.d, m, i)
1345 self.fhd.close()
1346 petrack.set_rlengths( rlengths )
1347 return petrack
1349 cpdef append_petrack (self, petrack):
1350 """Build PETrackI from all lines, return a PETrackI object.
1352 cdef:
1353 int64_t i = 0 # number of fragments kept
1354 int64_t m = 0 # sum of fragment lengths
1355 int32_t entrylength, fpos, chrid, tlen
1356 list references
1357 dict rlengths
1359 references, rlengths = self.get_references()
1360 fseek = self.fhd.seek
1361 fread = self.fhd.read
1362 ftell = self.fhd.tell
1364 # for convenience, only count valid pairs
1365 add_loc = petrack.add_loc
1366 err = struct.error
1368 while True:
1369 try:
1370 entrylength = unpack('<i', fread(4))[0]
1371 except err:
1372 break
1373 ( chrid, fpos, tlen ) = bampe_pe_entry_parser( fread(entrylength) )
1374 if chrid == -1:
1375 continue
1376 add_loc(references[ chrid ], fpos, fpos + tlen)
1377 m += tlen
1378 i += 1
1379 if i % 1000000 == 0:
1380 info(" %d fragments parsed" % i)
1382 info( "%d fragments have been read." % i )
1383 self.d = ( self.d * self.n + m ) / ( self.n + i )
1384 self.n += i
1385 #assert self.d >= 0, "Something went wrong (mean fragment size was negative: %d = %d / %d)" % (self.d, m, i)
1386 self.fhd.close()
1387 # this is the problematic part. If fwtrack is finalized, then it's impossible to increase the length of it in a step of buffer_size for multiple input files.
1388 # petrack.finalize()
1389 petrack.set_rlengths( rlengths )
1390 return petrack
1392 cdef class BowtieParser( GenericParser ):
1393 """File Parser Class for map files from Bowtie or MAQ's maqview
1394 program.
1397 cdef int32_t __tlen_parse_line ( self, bytes thisline ):
1398 """Parse tag sequence, then tag length.
1401 cdef list thisfields
1403 thisline = thisline.rstrip()
1404 if not thisline: return ( b"", -1, -1 )
1405 if thisline[ 0 ] == b"#": return ( b"", -1 , -1 ) # comment line is skipped
1406 thisfields = thisline.split( b'\t' ) # I hope it will never bring me more trouble
1407 return len( thisfields[ 4 ] )
1409 cdef tuple __fw_parse_line (self, bytes thisline ):
1411 The following definition comes from bowtie website:
1413 The bowtie aligner outputs each alignment on a separate
1414 line. Each line is a collection of 8 fields separated by tabs;
1415 from left to right, the fields are:
1417 1. Name of read that aligned
1419 2. Orientation of read in the alignment, - for reverse
1420 complement, + otherwise
1422 3. Name of reference sequence where alignment occurs, or
1423 ordinal ID if no name was provided
1425 4. 0-based offset into the forward reference strand where
1426 leftmost character of the alignment occurs
1428 5. Read sequence (reverse-complemented if orientation is -)
1430 6. ASCII-encoded read qualities (reversed if orientation is
1431 -). The encoded quality values are on the Phred scale and the
1432 encoding is ASCII-offset by 33 (ASCII char !).
1434 7. Number of other instances where the same read aligns
1435 against the same reference characters as were aligned against
1436 in this alignment. This is not the number of other places the
1437 read aligns with the same number of mismatches. The number in
1438 this column is generally not a good proxy for that number
1439 (e.g., the number in this column may be '0' while the number
1440 of other alignments with the same number of mismatches might
1441 be large). This column was previously described as 'Reserved'.
1443 8. Comma-separated list of mismatch descriptors. If there are
1444 no mismatches in the alignment, this field is empty. A single
1445 descriptor has the format offset:reference-base>read-base. The
1446 offset is expressed as a 0-based offset from the high-quality
1447 (5') end of the read.
1450 cdef:
1451 list thisfields
1452 bytes chromname
1454 thisline = thisline.rstrip()
1455 if not thisline: return ( b"", -1, -1 )
1456 if thisline[ 0 ] == b"#": return ( b"", -1, -1 ) # comment line is skipped
1457 thisfields = thisline.split( b'\t' ) # I hope it will never bring me more trouble
1459 chromname = thisfields[ 2 ]
1460 try:
1461 chromname = chromname[ :chromname.rindex( b".fa" ) ]
1462 except ValueError:
1463 pass
1465 if thisfields[ 1 ] == b"+":
1466 return ( chromname,
1467 atoi( thisfields[ 3 ] ),
1469 elif thisfields[ 1 ] == b"-":
1470 return ( chromname,
1471 atoi( thisfields[ 3 ] ) + strlen( thisfields[ 4 ] ),
1473 else:
1474 raise StrandFormatError( thisline, thisfields[ 1 ] )