MACS3/IO/Parser.pyx

   1 # cython: language_level=3
   2 # cython: profile=True
   3 # cython: linetrace=True
   4 # Time-stamp: <2020-12-06 23:32:28 Tao Liu>
   5
   6 """Module for all MACS Parser classes for input.
   7
   8 This code is free software; you can redistribute it and/or modify it
   9 under the terms of the BSD License (see the file LICENSE included with
  10 the distribution).
  11 """
  12
  13 # ------------------------------------
  14 # python modules
  15 # ------------------------------------
  16 import logging
  17 from logging import info, debug
  18 import struct
  19 from struct import unpack
  20 from re import findall
  21 import gzip
  22 import io
  23 import sys
  24
  25 # ------------------------------------
  26 # MACS3 modules
  27 # ------------------------------------
  28
  29 from MACS3.Utilities.Constants import *
  30 from MACS3.Signal.FixWidthTrack import FWTrack
  31 from MACS3.Signal.PairedEndTrack import PETrackI
  32
  33 # ------------------------------------
  34 # Other modules
  35 # ------------------------------------
  36 from cpython cimport bool
  37
  38 import numpy as np
  39 cimport numpy as np
  40 from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t
  41
  42 is_le = sys.byteorder == "little"
  43
  44 cdef extern from "stdlib.h":
  45     ctypedef uint32_t size_t
  46     size_t strlen(char *s)
  47     void *malloc(size_t size)
  48     void *calloc(size_t n, size_t size)
  49     void free(void *ptr)
  50     int32_t strcmp(char *a, char *b)
  51     char * strcpy(char *a, char *b)
  52     int64_t atol(char *str)
  53     int32_t atoi(char *str)
  54
  55 # ------------------------------------
  56 # constants
  57 # ------------------------------------
  58 __version__ = "Parser $Revision$"
  59 __author__ = "Tao Liu <taoliu@jimmy.harvard.edu>"
  60 __doc__ = "All Parser classes"
  61
  62 # ------------------------------------
  63 # Misc functions
  64 # ------------------------------------
  65
  66 cpdef guess_parser ( fname, int64_t buffer_size = 100000 ):
  67     # Note: BAMPE and BEDPE can't be automatically detected.
  68     ordered_parser_dict = {"BAM":BAMParser,
  69                            "BED":BEDParser,
  70                            "ELAND":ELANDResultParser,
  71                            "ELANDMULTI":ELANDMultiParser,
  72                            "ELANDEXPORT":ELANDExportParser,
  73                            "SAM":SAMParser,
  74                            "BOWTIE":BowtieParser}
  75
  76     for f in ordered_parser_dict:
  77         p = ordered_parser_dict[ f ]
  78         t_parser = p( fname, buffer_size = buffer_size )
  79         debug( "Testing format %s" % f )
  80         s = t_parser.sniff()
  81         if s:
  82             info( "Detected format is: %s" % ( f ) )
  83             if t_parser.is_gzipped():
  84                 info( "* Input file is gzipped." )
  85             return t_parser
  86         else:
  87             t_parser.close()
  88     raise Exception( "Can't detect format!" )
  89
  90 cdef tuple __bam_fw_binary_parse_le ( const unsigned char * data ):
  91     """Parse a BAM SE entry in little endian system
  92     """
  93     cdef:
  94         int32_t thisref, thisstart, thisstrand
  95         uint16_t bwflag
  96         uint8_t l_read_name
  97         uint16_t n_cigar_op
  98         int32_t cigar_code
  99         uint8_t *ui8
 100         int32_t *i32
 101         uint16_t *ui16
 102         uint32_t *ui32
 103
 104     # we skip lot of the available information in data (i.e. tag name, quality etc etc)
 105     # no data, return, does it really happen without raising struct.error?
 106     if not data: return ( -1, -1, -1 )
 107
 108     ui16 = <uint16_t *>data
 109     bwflag = ui16[7]
 110
 111     # we filter out unmapped sequence or bad sequence or  secondary or supplementary alignment
 112     # we filter out 2nd mate, not a proper pair, mate is unmapped
 113     if (bwflag & 2820) or (bwflag & 1 and (bwflag & 136 or not bwflag & 2)): return ( -1, -1, -1 )
 114
 115     i32 = <int32_t *>data
 116     thisref = i32[0]
 117     thisstart = i32[1]
 118     n_cigar_op = ui16[6]
 119     # In case of paired-end we have now skipped all possible "bad" pairs
 120     # in case of proper pair we have skipped the rightmost one... if the leftmost pair comes
 121     # we can treat it as a single read, so just check the strand and calculate its
 122     # start position... hope I'm right!
 123     if bwflag & 16:
 124         # read mapped to minus strand; then we have to compute cigar to find the rightmost position
 125         ui8 = <uint8_t *>data
 126         l_read_name = ui8[8]
 127         # need to decipher CIGAR string
 128         ui32 = <uint32_t *>(data + 32 + l_read_name) # move pointer to cigar_code
 129         for cigar_code in ui32[:n_cigar_op]:#unpack( '<%dI' % (n_cigar_op) , data[ 32 + l_read_name : 32 + l_read_name + n_cigar_op*4 ] ):
 130             if cigar_code & 15 in [ 0, 2, 3, 7, 8 ]:   # they are CIGAR op M/D/N/=/X
 131                 thisstart += cigar_code >> 4
 132         thisstrand = 1
 133     else:
 134         thisstrand = 0
 135
 136     return ( thisref, thisstart, thisstrand )
 137
 138 cdef tuple __bam_fw_binary_parse_be ( const unsigned char * data ):
 139     """Big endian version. We need byte swap.
 140     """
 141     cdef:
 142         int32_t thisref, thisstart, thisstrand
 143         uint16_t bwflag
 144         uint8_t l_read_name
 145         uint16_t n_cigar_op
 146         int32_t cigar_code
 147         uint8_t *ui8                      # we will only cast 1 byte at a time
 148         int32_t i
 149         uint32_t shift0, shift
 150
 151
 152     # we skip lot of the available information in data (i.e. tag name, quality etc etc)
 153     # no data, return, does it really happen without raising struct.error?
 154     if not data: return ( -1, -1, -1 )
 155
 156     ui8 = <uint8_t *>data
 157     bwflag = ui8[15] << 8 | ui8[14]       # it works as bwflag = ui16[7] in little-endian
 158
 159     # we filter out unmapped sequence or bad sequence or  secondary or supplementary alignment
 160     # we filter out 2nd mate, not a proper pair, mate is unmapped
 161     if (bwflag & 2820) or (bwflag & 1 and (bwflag & 136 or not bwflag & 2)): return ( -1, -1, -1 )
 162
 163     # the following three lins are for little-endian
 164     #thisref = i32[0]
 165     #thisstart = i32[1]
 166     #n_cigar_op = ui16[6]
 167
 168     # to simplify the byte swap, we pretend all original numbers (thisref, pos, nextpos) positive
 169     thisref = ui8[3] << 24 | ui8[2] << 16 | ui8[1] << 8 | ui8[0]
 170     thisstart = ui8[7] << 24 | ui8[6] << 16 | ui8[5] << 8 | ui8[4]
 171     n_cigar_op = ui8[13] << 8 | i8[12]
 172
 173     # In case of paired-end we have now skipped all possible "bad" pairs
 174     # in case of proper pair we have skipped the rightmost one... if the leftmost pair comes
 175     # we can treat it as a single read, so just check the strand and calculate its
 176     # start position... hope I'm right!
 177     if bwflag & 16:
 178         # read mapped to minus strand; then we have to compute cigar to find the rightmost position
 179         l_read_name = ui8[8]
 180         # need to decipher CIGAR string
 181         # move pointer to cigar_code
 182         shift0 = 32 + l_read_name
 183         for i in range(n_cigar_op):
 184             shift = shift0 + i*4          # move 32bit at a time
 185             cigar_code = ui8[shift0+3] << 24 | ui8[shift0+2] << 16 | ui8[shift0+1] << 8 | ui8[shift0] # it works like cigar_code = ui32[...] in little-endian
 186             if cigar_code & 15 in [ 0, 2, 3, 7, 8 ]:   # they are CIGAR op M/D/N/=/X
 187                 thisstart += cigar_code >> 4
 188         thisstrand = 1
 189     else:
 190         thisstrand = 0
 191
 192     return ( thisref, thisstart, thisstrand )
 193
 194 cdef tuple __bampe_pe_binary_parse_le (const unsigned char * data):
 195     """Parse a BAMPE record in little-endian system.
 196     """
 197     cdef:
 198         int32_t thisref, thisstart, thistlen
 199         int32_t nextpos, pos
 200         uint16_t bwflag
 201         uint8_t *ui8
 202         int32_t *i32
 203         uint16_t *ui16
 204
 205     # we skip lot of the available information in data (i.e. tag name, quality etc etc)
 206     if not data:
 207         return ( -1, -1, -1 )
 208
 209     ui16 = <uint16_t *>data
 210     bwflag = ui16[7]
 211     # we filter out unmapped, bad sequence, secondary/supplementary alignment
 212     # we filter out other mate of paired reads, not a proper pair, or mate is unmapped
 213     if (bwflag & 2820) or (bwflag & 1 and (bwflag & 136 or not bwflag & 2)):
 214         return ( -1, -1, -1 )
 215
 216     i32 = <int32_t *>data
 217     ui8 = <uint8_t *>data
 218     thisref = i32[0]
 219
 220     pos = i32[1]
 221     nextpos = i32[6]
 222     thistlen = i32[7]
 223     thisstart = min(pos, nextpos) # we keep only the leftmost
 224                                   # position which means this must
 225                                   # be at + strand. So we don't
 226                                   # need to decipher CIGAR string.
 227     thistlen = abs( thistlen )                    # Actually, if
 228     #                                             # the value
 229     #                                             # unpacked is
 230     #                                             # negative, then
 231     #                                             # nextpos is the
 232     #                                             # leftmost
 233     #                                             # position.
 234
 235     return ( thisref, thisstart, thistlen )
 236
 237 cdef tuple __bampe_pe_binary_parse_be (const unsigned char * data):
 238     """Parse a BAMPE record in big-endian system. And we need byte swap.
 239     """
 240     cdef:
 241         int32_t thisref, thisstart, thistlen
 242         uint32_t tmp_thistlen
 243         int32_t nextpos, pos
 244         uint16_t bwflag
 245         uint8_t *ui8                      # we will only cast 1 byte at a time
 246
 247     # we skip lot of the available information in data (i.e. tag name, quality etc etc)
 248     if not data:
 249         return ( -1, -1, -1 )
 250
 251     ui8 = <uint8_t *>data
 252
 253     bwflag = ui8[15] << 8 | ui8[14]       # as le: bwflag = ui16[7]
 254     # we filter out unmapped, bad sequence, secondary/supplementary alignment
 255     # we filter out other mate of paired reads, not a proper pair, or mate is unmapped
 256     if (bwflag & 2820) or (bwflag & 1 and (bwflag & 136 or not bwflag & 2)):
 257         return ( -1, -1, -1 )
 258
 259     i8 = <int8_t *>data
 260
 261     # the following three lins are for little-endian
 262
 263
 264
 265     # to simplify the byte swap, we pretend all original numbers (thisref, pos, nextpos) positive
 266     thisref = ui8[3] << 24 | ui8[2] << 16 | ui8[1] << 8 | ui8[0]   # as le:thisref = i32[0]
 267     pos = ui8[7] << 24 | ui8[6] << 16 | ui8[5] << 8 | ui8[4]       # as le:pos = i32[1]
 268     nextpos = ui8[27] << 24 | ui8[26] << 16 | ui8[25] << 8 | ui8[24] # as le:nextpos = i32[6]
 269
 270     # thistlen can be negative, so we byte swap it then convert to int32_t then take abs (maybe there is more effecient way?)
 271     tmp_thistlen = ui8[31] << 24 | ui8[30] << 16 | ui8[29] << 8 | ui8[28] # as le:tmp_thistlen = ui32[7]
 272     thistlen = abs(<int32_t> tmp_thistlen)
 273
 274     # position which means this must
 275     # be at + strand. So we don't
 276     # need to decipher CIGAR string.
 277     thisstart = pos if nextpos > pos else nextpos #min(pos, nextpos) # we keep only the leftmost
 278
 279     return ( thisref, thisstart, thistlen )
 280
 281
 282 # choose a parser according to endian
 283 if is_le:
 284     bam_se_entry_parser = __bam_fw_binary_parse_le
 285     bampe_pe_entry_parser = __bampe_pe_binary_parse_le
 286 else:
 287     bam_se_entry_parser = __bam_fw_binary_parse_be
 288     bampe_pe_entry_parser = __bampe_pe_binary_parse_be
 289
 290 # ------------------------------------
 291 # Classes
 292 # ------------------------------------
 293 class StrandFormatError( Exception ):
 294     """Exception about strand format error.
 295
 296     Example:
 297     raise StrandFormatError('Must be F or R','X')
 298     """
 299     def __init__ ( self, string, strand ):
 300         self.strand = strand
 301         self.string = string
 302
 303     def __str__ ( self ):
 304         return repr( "Strand information can not be recognized in this line: \"%s\",\"%s\"" % ( self.string, self.strand ) )
 305
 306 cdef class GenericParser:
 307     """Generic Parser class.
 308
 309     Inherit this class to write your own parser. In most cases, you need to override:
 310
 311     1. __tlen_parse_line which returns tag length of a line
 312     2.  __fw_parse_line which returns tuple of ( chromosome, 5'position, strand )
 313     """
 314     cdef:
 315         str filename
 316         bool gzipped
 317         int32_t tag_size
 318         object fhd
 319         int64_t buffer_size
 320
 321     def __init__ ( self, str filename, int64_t buffer_size = 100000 ):
 322         """Open input file. Determine whether it's a gzipped file.
 323
 324         'filename' must be a string object.
 325
 326         This function initialize the following attributes:
 327
 328         1. self.filename: the filename for input file.
 329         2. self.gzipped: a boolean indicating whether input file is gzipped.
 330         3. self.fhd: buffered I/O stream of input file
 331         """
 332         self.filename = filename
 333         self.gzipped = True
 334         self.tag_size = -1
 335         self.buffer_size = buffer_size
 336         # try gzip first
 337         f = gzip.open( filename )
 338         try:
 339             f.read( 10 )
 340         except IOError:
 341             # not a gzipped file
 342             self.gzipped = False
 343         f.close()
 344         if self.gzipped:
 345             # open with gzip.open, then wrap it with BufferedReader!
 346             self.fhd = io.BufferedReader( gzip.open( filename, mode='rb' ), buffer_size = READ_BUFFER_SIZE ) # buffersize set to 10M
 347         else:
 348             self.fhd = io.open( filename, mode='rb' ) # binary mode! I don't expect unicode here!
 349         self.__skip_first_commentlines()
 350
 351     cdef void __skip_first_commentlines ( self ):
 352         """Some parser needs to skip the first several comment lines.
 353
 354         Redefine this if it's necessary!
 355         """
 356         return
 357
 358     cpdef int32_t tsize( self ):
 359         """General function to detect tag size.
 360
 361         * Although it can be used by most parsers, it must be
 362           rewritten by BAMParser!
 363         """
 364         cdef:
 365             int32_t s = 0
 366             int32_t n = 0     # number of successful/valid read alignments
 367             int32_t m = 0     # number of trials
 368             int32_t this_taglength
 369             bytes thisline
 370
 371         if self.tag_size != -1:
 372             # if we have already calculated tag size (!= -1),  return it.
 373             return self.tag_size
 374
 375         # try 10k times or retrieve 10 successfule alignments
 376         while n < 10 and m < 10000:
 377             m += 1
 378             thisline = self.fhd.readline()
 379             this_taglength = self.__tlen_parse_line( thisline )
 380             if this_taglength > 0:
 381                 # this_taglength == 0 means this line doesn't contain
 382                 # successful alignment.
 383                 s += this_taglength
 384                 n += 1
 385         # done
 386         self.fhd.seek( 0 )
 387         self.__skip_first_commentlines()
 388         if n != 0:              # else tsize = -1
 389             self.tag_size = <int32_t>(s/n)
 390         return self.tag_size
 391
 392     cdef int32_t __tlen_parse_line ( self, bytes thisline ):
 393         """Abstract function to detect tag length.
 394
 395         """
 396         raise NotImplemented
 397
 398     cpdef build_fwtrack ( self ):
 399         """Generic function to build FWTrack object. Create a new
 400         FWTrack object. If you want to append new records to an
 401         existing FWTrack object, try append_fwtrack function.
 402
 403         * BAMParser for binary BAM format should have a different one.
 404         """
 405         cdef:
 406             int64_t i, fpos, strand
 407             bytes chromosome
 408             bytes tmp = b""
 409
 410         fwtrack = FWTrack( buffer_size = self.buffer_size )
 411         i = 0
 412         while True:
 413             # for each block of input
 414             tmp += self.fhd.read( READ_BUFFER_SIZE )
 415             if not tmp:
 416                 break
 417             lines = tmp.split(b"\n")
 418             tmp = lines[ -1 ]
 419             for thisline in lines[ :-1 ]:
 420                 ( chromosome, fpos, strand ) = self.__fw_parse_line( thisline )
 421                 if fpos < 0 or not chromosome:
 422                     # normally __fw_parse_line will return -1 if the line
 423                     # contains no successful alignment.
 424                     continue
 425                 i += 1
 426                 if i % 1000000 == 0:
 427                     info( " %d reads parsed" % i )
 428                 fwtrack.add_loc( chromosome, fpos, strand )
 429         # last one
 430         if tmp:
 431             ( chromosome, fpos, strand ) = self.__fw_parse_line( tmp )
 432             if fpos >= 0 and chromosome:
 433                 i += 1
 434                 fwtrack.add_loc( chromosome, fpos, strand )
 435         # close file stream.
 436         self.close()
 437         return fwtrack
 438
 439     cpdef append_fwtrack ( self, fwtrack ):
 440         """Add more records to an existing FWTrack object.
 441
 442         """
 443         cdef:
 444             int64_t i, fpos, strand
 445             bytes chromosome
 446             bytes tmp = b""
 447         i = 0
 448         while True:
 449             # for each block of input
 450             tmp += self.fhd.read( READ_BUFFER_SIZE )
 451             if not tmp:
 452                 break
 453             lines = tmp.split(b"\n")
 454             tmp = lines[ -1 ]
 455             for thisline in lines[ :-1 ]:
 456                 ( chromosome, fpos, strand ) = self.__fw_parse_line( thisline )
 457                 if fpos < 0 or not chromosome:
 458                     # normally __fw_parse_line will return -1 if the line
 459                     # contains no successful alignment.
 460                     continue
 461                 i += 1
 462                 if i % 1000000 == 0:
 463                     info( " %d reads parsed" % i )
 464                 fwtrack.add_loc( chromosome, fpos, strand )
 465
 466         # last one
 467         if tmp:
 468             ( chromosome, fpos, strand ) = self.__fw_parse_line( tmp )
 469             if fpos >= 0 and chromosome:
 470                 i += 1
 471                 fwtrack.add_loc( chromosome, fpos, strand )
 472         # close file stream.
 473         self.close()
 474         return fwtrack
 475
 476     cdef tuple __fw_parse_line ( self, bytes thisline ):
 477         """Abstract function to parse chromosome, 5' end position and
 478         strand.
 479
 480         """
 481         cdef bytes chromosome = b""
 482         cdef int32_t fpos = -1
 483         cdef int32_t strand = -1
 484         return ( chromosome, fpos, strand )
 485
 486     cpdef sniff ( self ):
 487         """Detect whether this parser is the correct parser for input
 488         file.
 489
 490         Rule: try to find the tag size using this parser, if error
 491         occurs or tag size is too small or too big, check is failed.
 492
 493         * BAMParser has a different sniff function.
 494         """
 495         cdef int32_t t
 496
 497         t = self.tsize()
 498         if t <= 10 or t >= 10000: # tsize too small or too big
 499             self.fhd.seek( 0 )
 500             return False
 501         else:
 502             self.fhd.seek( 0 )
 503             self.__skip_first_commentlines()
 504             return True
 505
 506     cpdef close ( self ):
 507         """Run this when this Parser will be never used.
 508
 509         Close file I/O stream.
 510         """
 511         self.fhd.close()
 512
 513     cpdef bool is_gzipped ( self ):
 514         return self.gzipped
 515
 516 cdef class BEDParser( GenericParser ):
 517     """File Parser Class for BED File.
 518
 519     """
 520     cdef void __skip_first_commentlines ( self ):
 521         """BEDParser needs to skip the first several comment lines.
 522         """
 523         cdef:
 524             int32_t l_line
 525             bytes this_line
 526         for thisline in self.fhd:
 527             l_line = len( thisline )
 528             if thisline and ( thisline[ :5 ] != b"track" ) \
 529                and ( thisline[ :7 ] != b"browser" ) \
 530                and ( thisline[ 0 ] != 35 ): # 35 is b"#"
 531                 break
 532
 533         # rewind from SEEK_CUR
 534         self.fhd.seek( -l_line, 1 )
 535         return
 536
 537     cdef int32_t __tlen_parse_line ( self, bytes thisline ):
 538         """Parse 5' and 3' position, then calculate frag length.
 539
 540         """
 541         thisline = thisline.rstrip()
 542         if not thisline:
 543             return 0
 544
 545         thisfields = thisline.split( b'\t' )
 546         return atoi( thisfields[ 2 ] )-atoi( thisfields[ 1 ] )
 547
 548     cdef tuple __fw_parse_line ( self, bytes thisline ):
 549         #cdef list thisfields
 550         cdef:
 551             bytes chromname
 552             list thisfields
 553
 554         thisline = thisline.rstrip()
 555         thisfields = thisline.split( b'\t' )
 556         chromname = thisfields[ 0 ]
 557         try:
 558             #if not strcmp(thisfields[ 5 ],b"+"):
 559             if thisfields[5] == b"+":
 560                 return ( chromname,
 561                          atoi( thisfields[ 1 ] ),
 562                          0 )
 563             #elif not strcmp(thisfields[ 5 ], b"-"):
 564             elif thisfields[5] == b"-":
 565                 return ( chromname,
 566                          atoi( thisfields[ 2 ] ),
 567                          1 )
 568             else:
 569                 raise StrandFormatError( thisline, thisfields[ 5 ] )
 570         except IndexError:
 571             # default pos strand if no strand
 572             # info can be found
 573             return ( chromname,
 574                      atoi( thisfields[ 1 ] ),
 575                      0 )
 576
 577 cdef class BEDPEParser(GenericParser):
 578     """Parser for BED format file containing PE information, and also
 579     can be used for the cases when users predefine the fragment
 580     locations by shifting/extending by themselves.
 581
 582     Format:
 583
 584     chromosome_name     frag_leftend    frag_rightend
 585
 586
 587     Note: Only the first columns are used!
 588     """
 589
 590     cdef public int32_t n
 591     cdef public float32_t d
 592
 593     cdef void __skip_first_commentlines ( self ):
 594         """BEDPEParser needs to skip the first several comment lines.
 595         """
 596         cdef:
 597             int32_t l_line
 598             bytes this_line
 599         for thisline in self.fhd:
 600             l_line = len( thisline )
 601             if thisline and ( thisline[ :5 ] != b"track" ) \
 602                and ( thisline[ :7 ] != b"browser" ) \
 603                and ( thisline[ 0 ] != 35 ): # 35 is b"#"
 604                 break
 605
 606         # rewind from SEEK_CUR
 607         self.fhd.seek( -l_line, 1 )
 608         return
 609
 610     cdef __pe_parse_line ( self, bytes thisline ):
 611         """ Parse each line, and return chromosome, left and right positions
 612
 613         """
 614         cdef:
 615             list thisfields
 616
 617         thisline = thisline.rstrip()
 618
 619         # still only support tabular as delimiter.
 620         thisfields = thisline.split( b'\t' )
 621         try:
 622             return ( thisfields[ 0 ],
 623                      atoi( thisfields[ 1 ] ),
 624                      atoi( thisfields[ 2 ] ) )
 625         except IndexError:
 626             raise Exception("Less than 3 columns found at this line: %s\n" % thisline)
 627
 628     cpdef build_petrack ( self ):
 629         """Build PETrackI from all lines.
 630
 631         """
 632         cdef:
 633             bytes chromname
 634             int32_t left_pos
 635             int32_t right_pos
 636             int64_t i = 0          # number of fragments
 637             int64_t m = 0          # sum of fragment lengths
 638             bytes tmp = b""
 639
 640         petrack = PETrackI( buffer_size = self.buffer_size )
 641         add_loc = petrack.add_loc
 642
 643         while True:
 644             # for each block of input
 645             tmp += self.fhd.read( READ_BUFFER_SIZE )
 646             if not tmp:
 647                 break
 648             lines = tmp.split(b"\n")
 649             tmp = lines[ -1 ]
 650             for thisline in lines[ :-1 ]:
 651                 ( chromosome, left_pos, right_pos ) = self.__pe_parse_line( thisline )
 652                 if left_pos < 0 or not chromosome:
 653                     continue
 654                 assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
 655                 m += right_pos - left_pos
 656                 i += 1
 657                 if i % 1000000 == 0:
 658                     info( " %d fragments parsed" % i )
 659                 add_loc( chromosome, left_pos, right_pos )
 660         # last one
 661         if tmp:
 662             ( chromosome, left_pos, right_pos ) = self.__pe_parse_line( thisline )
 663             if left_pos >= 0 and chromosome:
 664                 assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
 665                 i += 1
 666                 m += right_pos - left_pos
 667                 add_loc( chromosome, left_pos, right_pos )
 668
 669         self.d = <float32_t>( m ) / i
 670         self.n = i
 671         assert self.d >= 0, "Something went wrong (mean fragment size was negative)"
 672
 673         self.close()
 674         petrack.set_rlengths( {"DUMMYCHROM":0} )
 675         return petrack
 676
 677     cpdef append_petrack (self, petrack):
 678         """Build PETrackI from all lines, return a PETrackI object.
 679         """
 680         cdef:
 681             bytes chromname
 682             int32_t left_pos
 683             int32_t right_pos
 684             int64_t i = 0          # number of fragments
 685             int64_t m = 0          # sum of fragment lengths
 686             bytes tmp = b""
 687
 688         add_loc = petrack.add_loc
 689         while True:
 690             # for each block of input
 691             tmp += self.fhd.read( READ_BUFFER_SIZE )
 692             if not tmp:
 693                 break
 694             lines = tmp.split(b"\n")
 695             tmp = lines[ -1 ]
 696             for thisline in lines[ :-1 ]:
 697                 ( chromosome, left_pos, right_pos ) = self.__pe_parse_line( thisline )
 698                 if left_pos < 0 or not chromosome:
 699                     continue
 700                 assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
 701                 m += right_pos - left_pos
 702                 i += 1
 703                 if i % 1000000 == 0:
 704                     info( " %d fragments parsed" % i )
 705                 add_loc( chromosome, left_pos, right_pos )
 706         # last one
 707         if tmp:
 708             ( chromosome, left_pos, right_pos ) = self.__pe_parse_line( thisline )
 709             if left_pos >= 0 and chromosome:
 710                 assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
 711                 i += 1
 712                 m += right_pos - left_pos
 713                 add_loc( chromosome, left_pos, right_pos )
 714
 715         self.d = ( self.d * self.n + m ) / ( self.n + i )
 716         self.n += i
 717
 718         assert self.d >= 0, "Something went wrong (mean fragment size was negative)"
 719         self.close()
 720         petrack.set_rlengths( {"DUMMYCHROM":0} )
 721         return petrack
 722
 723 cdef class ELANDResultParser( GenericParser ):
 724     """File Parser Class for tabular File.
 725
 726     """
 727     cdef void __skip_first_commentlines ( self ):
 728         """ELANDResultParser needs to skip the first several comment lines.
 729         """
 730         cdef:
 731             int32_t l_line
 732             bytes this_line
 733         for thisline in self.fhd:
 734             l_line = len( thisline )
 735             if thisline and thisline[ 0 ] != 35: # 35 is b"#"
 736                 break
 737
 738         # rewind from SEEK_CUR
 739         self.fhd.seek( -l_line, 1 )
 740         return
 741
 742     cdef int32_t __tlen_parse_line ( self, bytes thisline ):
 743         """Parse tag sequence, then tag length.
 744
 745         """
 746         thisline = thisline.rstrip()
 747         if not thisline: return 0
 748         thisfields = thisline.split( b'\t' )
 749         if thisfields[1].isdigit():
 750             return 0
 751         else:
 752             return len( thisfields[ 1 ] )
 753
 754     cdef tuple __fw_parse_line ( self, bytes thisline ):
 755         cdef:
 756             bytes chromname, strand
 757             int32_t thistaglength
 758         thisline = thisline.rstrip()
 759         if not thisline: return ( b"", -1, -1 )
 760
 761         thisfields = thisline.split( b'\t' )
 762         thistaglength = strlen( thisfields[ 1 ] )
 763
 764         if len( thisfields ) <= 6:
 765             return ( b"", -1, -1 )
 766
 767         try:
 768             chromname = thisfields[ 6 ]
 769             chromname = chromname[ :chromname.rindex( b".fa" ) ]
 770         except ValueError:
 771             pass
 772
 773         if thisfields[ 2 ] == b"U0" or thisfields[ 2 ] == b"U1" or thisfields[ 2 ] == b"U2":
 774             # allow up to 2 mismatches...
 775             strand = thisfields[ 8 ]
 776             if strand == b"F":
 777                 return ( chromname,
 778                          atoi( thisfields[ 7 ] ) - 1,
 779                          0 )
 780             elif strand == b"R":
 781                 return ( chromname,
 782                          atoi( thisfields[ 7 ] ) + thistaglength - 1,
 783                          1 )
 784             else:
 785                 raise StrandFormatError( thisline, strand )
 786         else:
 787             return ( b"", -1, -1 )
 788
 789 cdef class ELANDMultiParser( GenericParser ):
 790     """File Parser Class for ELAND multi File.
 791
 792     Note this parser can only work for s_N_eland_multi.txt format.
 793
 794     Each line of the output file contains the following fields:
 795     1. Sequence name
 796     2. Sequence
 797     3. Either NM, QC, RM (as described above) or the following:
 798     4. x:y:z where x, y, and z are the number of exact, single-error, and 2-error matches
 799     found
 800     5. Blank, if no matches found or if too many matches found, or the following:
 801     BAC_plus_vector.fa:163022R1,170128F2,E_coli.fa:3909847R1
 802     This says there are two matches to BAC_plus_vector.fa: one in the reverse direction
 803     starting at position 160322 with one error, one in the forward direction starting at
 804     position 170128 with two errors. There is also a single-error match to E_coli.fa.
 805     """
 806     cdef void __skip_first_commentlines ( self ):
 807         """ELANDResultParser needs to skip the first several comment lines.
 808         """
 809         cdef:
 810             int32_t l_line
 811             bytes this_line
 812         for thisline in self.fhd:
 813             l_line = len( thisline )
 814             if thisline and thisline[ 0 ] != 35: # 35 is b"#"
 815                 break
 816
 817         # rewind from SEEK_CUR
 818         self.fhd.seek( -l_line, 1 )
 819         return
 820
 821     cdef int32_t __tlen_parse_line ( self, bytes thisline ):
 822         """Parse tag sequence, then tag length.
 823
 824         """
 825         thisline = thisline.rstrip()
 826         if not thisline: return 0
 827         thisfields = thisline.split( b'\t' )
 828         if thisfields[1].isdigit():
 829             return 0
 830         else:
 831             return len( thisfields[ 1 ] )
 832
 833     cdef tuple __fw_parse_line ( self, bytes thisline ):
 834         cdef:
 835             list thisfields
 836             bytes thistagname, pos, strand
 837             int32_t thistaglength, thistaghits
 838
 839         if not thisline: return ( b"", -1, -1 )
 840         thisline = thisline.rstrip()
 841         if not thisline: return ( b"", -1, -1 )
 842
 843         thisfields = thisline.split( b'\t' )
 844         thistagname = thisfields[ 0 ]        # name of tag
 845         thistaglength = len( thisfields[ 1 ] ) # length of tag
 846
 847         if len( thisfields ) < 4:
 848             return ( b"", -1, -1 )
 849         else:
 850             thistaghits = sum( [<int32_t>(x) for x in thisfields[ 2 ].split( b':' ) ] )
 851             if thistaghits > 1:
 852                 # multiple hits
 853                 return ( b"", -1, -1 )
 854             else:
 855                 ( chromname, pos ) = thisfields[ 3 ].split( b':' )
 856
 857                 try:
 858                     chromname = chromname[ :chromname.rindex( b".fa" ) ]
 859                 except ValueError:
 860                     pass
 861
 862                 strand  = pos[ -2 ]
 863                 if strand == b"F":
 864                     return ( chromname,
 865                              <int32_t>( pos[ :-2 ] )-1,
 866                              0 )
 867                 elif strand == b"R":
 868                     return ( chromname,
 869                              <int32_t>( pos[ :-2 ] ) + thistaglength - 1,
 870                              1 )
 871                 else:
 872                     raise StrandFormatError( thisline,strand )
 873
 874
 875 cdef class ELANDExportParser( GenericParser ):
 876     """File Parser Class for ELAND Export File.
 877
 878     """
 879     cdef void __skip_first_commentlines ( self ):
 880         """ELANDResultParser needs to skip the first several comment lines.
 881         """
 882         cdef:
 883             int32_t l_line
 884             bytes this_line
 885         for thisline in self.fhd:
 886             l_line = len( thisline )
 887             if thisline and thisline[ 0 ] != 35: # 35 is b"#"
 888                 break
 889
 890         # rewind from SEEK_CUR
 891         self.fhd.seek( -l_line, 1 )
 892         return
 893
 894     cdef int32_t __tlen_parse_line ( self, bytes thisline ):
 895         """Parse tag sequence, then tag length.
 896
 897         """
 898         thisline = thisline.rstrip()
 899         if not thisline: return 0
 900         thisfields = thisline.split( b'\t' )
 901         if len( thisfields ) > 12 and thisfields[ 12 ]:
 902             # a successful alignment has over 12 columns
 903             return len( thisfields[ 8 ] )
 904         else:
 905             return 0
 906
 907     cdef tuple __fw_parse_line ( self, bytes thisline ):
 908         cdef:
 909             list thisfields
 910             bytes thisname, strand
 911             int32_t thistaglength
 912
 913         thisline = thisline.rstrip()
 914         if not thisline: return ( b"", -1, -1 )
 915
 916         thisfields = thisline.split( b"\t" )
 917
 918         if len(thisfields) > 12 and thisfields[ 12 ]:
 919             thisname = b":".join( thisfields[ 0:6 ] )
 920             thistaglength = len( thisfields[ 8 ] )
 921             strand = thisfields[ 13 ]
 922             if strand == b"F":
 923                 return ( thisfields[ 10 ], atoi( thisfields[ 12 ] ) - 1, 0 )
 924             elif strand == b"R":
 925                 return ( thisfields[ 10 ], atoi( thisfields[ 12 ] ) + thistaglength - 1, 1 )
 926             else:
 927                 raise StrandFormatError( thisline, strand )
 928         else:
 929             return ( b"", -1, -1 )
 930
 931 ### Contributed by Davide, modified by Tao
 932 cdef class SAMParser( GenericParser ):
 933     """File Parser Class for SAM File.
 934
 935     Each line of the output file contains at least:
 936     1. Sequence name
 937     2. Bitwise flag
 938     3. Reference name
 939     4. 1-based leftmost position fo clipped alignment
 940     5. Mapping quality
 941     6. CIGAR string
 942     7. Mate Reference Name
 943     8. 1-based leftmost Mate Position
 944     9. Inferred insert size
 945     10. Query sequence on the same strand as the reference
 946     11. Query quality
 947
 948     The bitwise flag is made like this:
 949     dec meaning
 950     --- -------
 951     1   paired read
 952     2   proper pair
 953     4   query unmapped
 954     8   mate unmapped
 955     16  strand of the query (1 -> reverse)
 956     32  strand of the mate
 957     64  first read in pair
 958     128 second read in pair
 959     256 alignment is not primary
 960     512 does not pass quality check
 961     1024        PCR or optical duplicate
 962     2048        supplementary alignment
 963     """
 964
 965     cdef void __skip_first_commentlines ( self ):
 966         """SAMParser needs to skip the first several comment lines.
 967         """
 968         cdef:
 969             int32_t l_line
 970             bytes this_line
 971         for thisline in self.fhd:
 972             l_line = len( thisline )
 973             if thisline and thisline[ 0 ] != 64: # 64 is b"@"
 974                 break
 975
 976         # rewind from SEEK_CUR
 977         self.fhd.seek( -l_line, 1 )
 978         return
 979
 980     cdef int32_t __tlen_parse_line ( self, bytes thisline ):
 981         """Parse tag sequence, then tag length.
 982
 983         """
 984         cdef:
 985             list thisfields
 986             int32_t bwflag
 987
 988         thisline = thisline.rstrip()
 989         if not thisline: return 0
 990         thisfields = thisline.split( b'\t' )
 991         bwflag = atoi( thisfields[ 1 ] )
 992         if bwflag & 4 or bwflag & 512 or bwflag & 256 or bwflag & 2048:
 993             return 0       #unmapped sequence or bad sequence or 2nd or sup alignment
 994         if bwflag & 1:
 995             # paired read. We should only keep sequence if the mate is mapped
 996             # and if this is the left mate, all is within  the flag!
 997             if not bwflag & 2:
 998                 return 0   # not a proper pair
 999             if bwflag & 8:
1000                 return 0   # the mate is unmapped
1001             # From Benjamin Schiller https://github.com/benjschiller
1002             if bwflag & 128:
1003                 # this is not the first read in a pair
1004                 return 0
1005         return len( thisfields[ 9 ] )
1006
1007     cdef tuple __fw_parse_line ( self, bytes thisline ):
1008         cdef:
1009             list thisfields
1010             bytes thistagname, thisref
1011             int32_t bwflag, thisstrand, thisstart
1012
1013         thisline = thisline.rstrip()
1014         if not thisline: return ( b"", -1, -1 )
1015         thisfields = thisline.split( b'\t' )
1016         thistagname = thisfields[ 0 ]         # name of tag
1017         thisref = thisfields[ 2 ]
1018         bwflag = atoi( thisfields[ 1 ] )
1019         CIGAR = thisfields[ 5 ]
1020
1021         if (bwflag & 2820) or (bwflag & 1 and (bwflag & 136 or not bwflag & 2)): return ( b"", -1, -1 )
1022
1023         #if bwflag & 4 or bwflag & 512 or bwflag & 256 or bwflag & 2048:
1024         #    return ( b"", -1, -1 )       #unmapped sequence or bad sequence or 2nd or sup alignment
1025         #if bwflag & 1:
1026         #    # paired read. We should only keep sequence if the mate is mapped
1027         #    # and if this is the left mate, all is within  the flag!
1028         #    if not bwflag & 2:
1029         #        return ( b"", -1, -1 )   # not a proper pair
1030         #    if bwflag & 8:
1031         #        return ( b"", -1, -1 )   # the mate is unmapped
1032         #    # From Benjamin Schiller https://github.com/benjschiller
1033         #    if bwflag & 128:
1034         #        # this is not the first read in a pair
1035         #        return ( b"", -1, -1 )
1036         #    # end of the patch
1037         # In case of paired-end we have now skipped all possible "bad" pairs
1038         # in case of proper pair we have skipped the rightmost one... if the leftmost pair comes
1039         # we can treat it as a single read, so just check the strand and calculate its
1040         # start position... hope I'm right!
1041         if bwflag & 16:
1042             # minus strand, we have to decipher CIGAR string
1043             thisstrand = 1
1044             thisstart = atoi( thisfields[ 3 ] ) - 1 + sum( [ <int32_t>(x) for x in findall(b"(\d+)[MDNX=]",CIGAR) ] )   #reverse strand should be shifted alen bp
1045         else:
1046             thisstrand = 0
1047             thisstart = atoi( thisfields[ 3 ] ) - 1
1048
1049         try:
1050             thisref = thisref[ :thisref.rindex( b".fa" ) ]
1051         except ValueError:
1052             pass
1053
1054         return ( thisref, thisstart, thisstrand )
1055
1056 cdef class BAMParser( GenericParser ):
1057     """File Parser Class for BAM File.
1058
1059     File is gzip-compatible and binary.
1060     Information available is the same that is in SAM format.
1061
1062     The bitwise flag is made like this:
1063     dec meaning
1064     --- -------
1065     1   paired read
1066     2   proper pair
1067     4   query unmapped
1068     8   mate unmapped
1069     16  strand of the query (1 -> reverse)
1070     32  strand of the mate
1071     64  first read in pair
1072     128 second read in pair
1073     256 alignment is not primary
1074     512 does not pass quality check
1075     1024        PCR or optical duplicate
1076     2048        supplementary alignment
1077     """
1078     def __init__ ( self, str filename, int64_t buffer_size = 100000 ):
1079         """Open input file. Determine whether it's a gzipped file.
1080
1081         'filename' must be a string object.
1082
1083         This function initialize the following attributes:
1084
1085         1. self.filename: the filename for input file.
1086         2. self.gzipped: a boolean indicating whether input file is gzipped.
1087         3. self.fhd: buffered I/O stream of input file
1088         """
1089         self.filename = filename
1090         self.gzipped = True
1091         self.tag_size = -1
1092         self.buffer_size = buffer_size
1093         # try gzip first
1094         f = gzip.open( filename )
1095         try:
1096             f.read( 10 )
1097         except IOError:
1098             # not a gzipped file
1099             self.gzipped = False
1100         f.close()
1101         if self.gzipped:
1102             # open with gzip.open, then wrap it with BufferedReader!
1103             self.fhd = io.BufferedReader( gzip.open( filename, mode='rb' ), buffer_size = READ_BUFFER_SIZE) # buffersize set to 1M
1104         else:
1105             self.fhd = io.open( filename, mode='rb' ) # binary mode! I don't expect unicode here!
1106
1107     cpdef sniff( self ):
1108         """Check the first 3 bytes of BAM file. If it's 'BAM', check
1109         is success.
1110
1111         """
1112         magic_header = self.fhd.read( 3 )
1113         if magic_header == b"BAM":
1114             tsize  = self.tsize()
1115             if tsize > 0:
1116                 self.fhd.seek( 0 )
1117                 return True
1118             else:
1119                 self.fhd.seek( 0 )
1120                 raise Exception( "File is not of a valid BAM format! %d" % tsize )
1121         else:
1122             self.fhd.seek( 0 )
1123             return False
1124
1125     cpdef int32_t tsize ( self ):
1126         """Get tag size from BAM file -- read l_seq field.
1127
1128         Refer to: http://samtools.sourceforge.net/SAM1.pdf
1129
1130         * This may not work for BAM file from bedToBAM (bedtools),
1131         since the l_seq field seems to be 0.
1132         """
1133
1134         cdef:
1135             int32_t x, header_len, nc, nlength
1136             int32_t n = 0                   # successful read of tag size
1137             float64_t s = 0                # sum of tag sizes
1138
1139         if self.tag_size != -1:
1140             # if we have already calculated tag size (!= -1),  return it.
1141             return self.tag_size
1142
1143         fseek = self.fhd.seek
1144         fread = self.fhd.read
1145         ftell = self.fhd.tell
1146         # move to pos 4, there starts something
1147         fseek( 4 )
1148         header_len =  unpack( '<i', fread( 4 ) )[ 0 ]
1149         fseek( header_len + ftell() )
1150         # get the number of chromosome
1151         nc = unpack( '<i', fread( 4 ) )[ 0 ]
1152         for x in range( nc ):
1153             # read each chromosome name
1154             nlength = unpack( '<i' , fread( 4 ) )[ 0 ]
1155             # jump over chromosome size, we don't need it
1156             fread( nlength )
1157             fseek( ftell() + 4 )
1158         while n < 10:
1159             entrylength = unpack( '<i', fread( 4 ) )[ 0 ]
1160             data = fread( entrylength )
1161             a = unpack( '<i', data[16:20] )[ 0 ]
1162             s += a
1163             n += 1
1164         fseek( 0 )
1165         self.tag_size = <int32_t>(s/n)
1166         return self.tag_size
1167
1168     cpdef tuple get_references( self ):
1169         """
1170         read in references from BAM header
1171
1172         return a tuple (references (list of names),
1173                         rlengths (dict of lengths)
1174         """
1175         cdef:
1176             int32_t header_len, x, nc, nlength
1177             bytes refname
1178             list references = []
1179             dict rlengths = {}
1180
1181         fseek = self.fhd.seek
1182         fread = self.fhd.read
1183         ftell = self.fhd.tell
1184         # move to pos 4, there starts something
1185         fseek(4)
1186         header_len =  unpack( '<i', fread( 4 ) )[ 0 ]
1187         fseek( header_len + ftell() )
1188         # get the number of chromosome
1189         nc = unpack( '<i', fread( 4 ) )[ 0 ]
1190         for x in range( nc ):
1191             # read each chromosome name
1192             nlength = unpack( '<i', fread( 4 ) )[ 0 ]
1193             refname = fread( nlength )[ :-1 ]
1194             references.append( refname )
1195             # don't jump over chromosome size
1196             # we can use it to avoid falling of chrom ends during peak calling
1197             rlengths[refname] = unpack( '<i', fread( 4 ) )[ 0 ]
1198         return (references, rlengths)
1199
1200     cpdef build_fwtrack ( self ):
1201         """Build FWTrack from all lines, return a FWTrack object.
1202
1203         Note only the unique match for a tag is kept.
1204         """
1205         cdef:
1206             int64_t i = 0                           #number of reads kept
1207             int32_t entrylength, fpos, strand, chrid
1208             list references
1209             dict rlengths
1210
1211         fwtrack = FWTrack( buffer_size = self.buffer_size )
1212         references, rlengths = self.get_references() # after this, ptr at list of alignments
1213         fseek = self.fhd.seek
1214         fread = self.fhd.read
1215         ftell = self.fhd.tell
1216
1217         while True:
1218             try:
1219                 entrylength = unpack( "<i", fread( 4 ) )[0]
1220             except struct.error:
1221                 break
1222             ( chrid, fpos, strand ) = bam_se_entry_parser( fread( entrylength ) )
1223             if chrid == -1: continue
1224             fwtrack.add_loc( references[ chrid ], fpos, strand )
1225             i += 1
1226             if i % 1000000 == 0:
1227                 info( " %d reads parsed" % i )
1228
1229         #print( f"{references[chrid]:},{fpos:},{strand:}" )
1230         info( "%d reads have been read." % i )
1231         self.fhd.close()
1232         fwtrack.set_rlengths( rlengths )
1233         return fwtrack
1234
1235     cpdef append_fwtrack ( self, fwtrack ):
1236         """Build FWTrack from all lines, return a FWTrack object.
1237
1238         Note only the unique match for a tag is kept.
1239         """
1240         cdef:
1241             int64_t i = 0                     #number of reads kept
1242             int32_t entrylength, fpos, strand, chrid
1243             list references
1244             dict rlengths
1245
1246         references, rlengths = self.get_references()
1247         fseek = self.fhd.seek
1248         fread = self.fhd.read
1249         ftell = self.fhd.tell
1250
1251         while True:
1252             try:
1253                 entrylength = unpack( '<i', fread( 4 ) )[ 0 ]
1254             except struct.error:
1255                 break
1256             ( chrid, fpos, strand ) = bam_se_entry_parser( fread( entrylength ) )
1257             if chrid == -1: continue
1258             fwtrack.add_loc( references[ chrid ], fpos, strand )
1259             i += 1
1260             if i % 1000000 == 0:
1261                 info( " %d reads parsed" % i )
1262
1263         info( "%d reads have been read." % i )
1264         self.fhd.close()
1265         #fwtrack.finalize()
1266         # this is the problematic part. If fwtrack is finalized, then it's impossible to increase the length of it in a step of buffer_size for multiple input files.
1267         fwtrack.set_rlengths( rlengths )
1268         return fwtrack
1269
1270 cdef class BAMPEParser(BAMParser):
1271     """File Parser Class for BAM File containing paired-end reads
1272     Only counts valid pairs, discards everything else
1273     Uses the midpoint of every read and calculates the average fragment size
1274     on the fly instead of modeling it
1275
1276     File is gzip-compatible and binary.
1277     Information available is the same that is in SAM format.
1278
1279     The bitwise flag is made like this:
1280     dec    meaning
1281     ---    -------
1282     1    paired read
1283     2    proper pair
1284     4    query unmapped
1285     8    mate unmapped
1286     16    strand of the query (1 -> reverse)
1287     32    strand of the mate
1288     64    first read in pair
1289     128    second read in pair
1290     256    alignment is not primary
1291     512    does not pass quality check
1292     1024    PCR or optical duplicate
1293     2048    supplementary alignment
1294     """
1295     cdef public int32_t n           # total number of fragments
1296     cdef public float32_t d         # the average length of fragments
1297
1298     cpdef object build_petrack ( self ):
1299         """Build PETrackI from all lines, return a FWTrack object.
1300         """
1301         cdef:
1302             int64_t i          # number of fragments kept
1303             int64_t m          # sum of fragment lengths
1304             int32_t entrylength, fpos, chrid, tlen
1305             list references
1306             dict rlengths
1307
1308         i = 0
1309         m = 0
1310
1311         petrack = PETrackI( buffer_size = self.buffer_size )
1312         references, rlengths = self.get_references()
1313         fseek = self.fhd.seek
1314         fread = self.fhd.read
1315         ftell = self.fhd.tell
1316
1317         # for convenience, only count valid pairs
1318         add_loc = petrack.add_loc
1319         err = struct.error
1320
1321         while True:
1322             try:
1323                 entrylength = unpack( '<i', fread(4) )[0]
1324             except err:
1325                 #e1 += 1
1326                 break
1327             ( chrid, fpos, tlen ) = bampe_pe_entry_parser( fread(entrylength) )
1328             if chrid == -1:
1329                 #e2 += 1
1330                 continue
1331             add_loc(references[ chrid ], fpos, fpos + tlen)
1332             m += tlen
1333             i += 1
1334             if i % 1000000 == 0:
1335                 info( " %d fragments parsed" % i )
1336
1337         #print( f"{references[chrid]:},{fpos:},{tlen:}" )
1338         info( "%d fragments have been read." % i )
1339         #debug( f" {e1} Can't identify the length of entry, it may be the end of file, stop looping..." )
1340         #debug( f" {e2} Chromosome name can't be found which means this entry is skipped ..." )
1341         #assert i > 0, "Something went wrong, no fragment has been read! Check input file!"
1342         self.d = m / i
1343         self.n = i
1344         #assert self.d >= 0, "Something went wrong (mean fragment size was negative: %d = %d / %d)" % (self.d, m, i)
1345         self.fhd.close()
1346         petrack.set_rlengths( rlengths )
1347         return petrack
1348
1349     cpdef append_petrack (self, petrack):
1350         """Build PETrackI from all lines, return a PETrackI object.
1351         """
1352         cdef:
1353             int64_t i = 0          # number of fragments kept
1354             int64_t m = 0          # sum of fragment lengths
1355             int32_t entrylength, fpos, chrid, tlen
1356             list references
1357             dict rlengths
1358
1359         references, rlengths = self.get_references()
1360         fseek = self.fhd.seek
1361         fread = self.fhd.read
1362         ftell = self.fhd.tell
1363
1364         # for convenience, only count valid pairs
1365         add_loc = petrack.add_loc
1366         err = struct.error
1367
1368         while True:
1369             try:
1370                 entrylength = unpack('<i', fread(4))[0]
1371             except err:
1372                 break
1373             ( chrid, fpos, tlen ) = bampe_pe_entry_parser( fread(entrylength) )
1374             if chrid == -1:
1375                 continue
1376             add_loc(references[ chrid ], fpos, fpos + tlen)
1377             m += tlen
1378             i += 1
1379             if i % 1000000 == 0:
1380                 info(" %d fragments parsed" % i)
1381
1382         info( "%d fragments have been read." % i )
1383         self.d = ( self.d * self.n + m ) / ( self.n + i )
1384         self.n += i
1385         #assert self.d >= 0, "Something went wrong (mean fragment size was negative: %d = %d / %d)" % (self.d, m, i)
1386         self.fhd.close()
1387         # this is the problematic part. If fwtrack is finalized, then it's impossible to increase the length of it in a step of buffer_size for multiple input files.
1388         # petrack.finalize()
1389         petrack.set_rlengths( rlengths )
1390         return petrack
1391
1392 cdef class BowtieParser( GenericParser ):
1393     """File Parser Class for map files from Bowtie or MAQ's maqview
1394     program.
1395
1396     """
1397     cdef int32_t __tlen_parse_line ( self, bytes thisline ):
1398         """Parse tag sequence, then tag length.
1399
1400         """
1401         cdef list thisfields
1402
1403         thisline = thisline.rstrip()
1404         if not thisline: return ( b"", -1, -1 )
1405         if thisline[ 0 ] == b"#": return ( b"", -1 , -1 ) # comment line is skipped
1406         thisfields = thisline.split( b'\t' ) # I hope it will never bring me more trouble
1407         return len( thisfields[ 4 ] )
1408
1409     cdef tuple __fw_parse_line (self, bytes thisline ):
1410         """
1411         The following definition comes from bowtie website:
1412
1413         The bowtie aligner outputs each alignment on a separate
1414         line. Each line is a collection of 8 fields separated by tabs;
1415         from left to right, the fields are:
1416
1417         1. Name of read that aligned
1418
1419         2. Orientation of read in the alignment, - for reverse
1420         complement, + otherwise
1421
1422         3. Name of reference sequence where alignment occurs, or
1423         ordinal ID if no name was provided
1424
1425         4. 0-based offset into the forward reference strand where
1426         leftmost character of the alignment occurs
1427
1428         5. Read sequence (reverse-complemented if orientation is -)
1429
1430         6. ASCII-encoded read qualities (reversed if orientation is
1431         -). The encoded quality values are on the Phred scale and the
1432         encoding is ASCII-offset by 33 (ASCII char !).
1433
1434         7. Number of other instances where the same read aligns
1435         against the same reference characters as were aligned against
1436         in this alignment. This is not the number of other places the
1437         read aligns with the same number of mismatches. The number in
1438         this column is generally not a good proxy for that number
1439         (e.g., the number in this column may be '0' while the number
1440         of other alignments with the same number of mismatches might
1441         be large). This column was previously described as 'Reserved'.
1442
1443         8. Comma-separated list of mismatch descriptors. If there are
1444         no mismatches in the alignment, this field is empty. A single
1445         descriptor has the format offset:reference-base>read-base. The
1446         offset is expressed as a 0-based offset from the high-quality
1447         (5') end of the read.
1448
1449         """
1450         cdef:
1451             list thisfields
1452             bytes chromname
1453
1454         thisline = thisline.rstrip()
1455         if not thisline: return ( b"", -1, -1 )
1456         if thisline[ 0 ] == b"#": return ( b"", -1, -1 ) # comment line is skipped
1457         thisfields = thisline.split( b'\t' ) # I hope it will never bring me more trouble
1458
1459         chromname = thisfields[ 2 ]
1460         try:
1461             chromname = chromname[ :chromname.rindex( b".fa" ) ]
1462         except ValueError:
1463             pass
1464
1465             if thisfields[ 1 ] == b"+":
1466                 return ( chromname,
1467                          atoi( thisfields[ 3 ] ),
1468                          0 )
1469             elif thisfields[ 1 ] == b"-":
1470                 return ( chromname,
1471                          atoi( thisfields[ 3 ] ) + strlen( thisfields[ 4 ] ),
1472                          1 )
1473             else:
1474                 raise StrandFormatError( thisline, thisfields[ 1 ] )
1475