reader.py

   1 # -*- encoding: utf-8 -*-
   2 #
   3 #
   4 # Copyright (C) 2007-2011 Jörg Lehmann <joergl@users.sourceforge.net>
   5 # Copyright (C) 2007-2011 André Wobst <wobsta@users.sourceforge.net>
   6 #
   7 # This file is part of PyX (http://pyx.sourceforge.net/).
   8 #
   9 # PyX is free software; you can redistribute it and/or modify
  10 # it under the terms of the GNU General Public License as published by
  11 # the Free Software Foundation; either version 2 of the License, or
  12 # (at your option) any later version.
  13 #
  14 # PyX is distributed in the hope that it will be useful,
  15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 # GNU General Public License for more details.
  18 #
  19 # You should have received a copy of the GNU General Public License
  20 # along with PyX; if not, write to the Free Software
  21 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
  22
  23
  24 import io, struct
  25
  26
  27 class reader:
  28
  29     def __init__(self, filename):
  30         self.file = open(filename, "rb")
  31
  32     def tell(self):
  33         return self.file.tell()
  34
  35     def eof(self):
  36         return self.file.eof()
  37
  38     def read(self, bytes):
  39         return self.file.read(bytes)
  40
  41     def readint(self, bytes=4, signed=0):
  42         first = 1
  43         result = 0
  44         while bytes:
  45             value = ord(self.file.read(1))
  46             if first and signed and value > 127:
  47                 value -= 256
  48             first = 0
  49             result = 256 * result + value
  50             bytes -= 1
  51         return result
  52
  53     def readint32(self):
  54         return struct.unpack(">l", self.file.read(4))[0]
  55
  56     def readuint32(self):
  57         return struct.unpack(">L", self.file.read(4))[0]
  58
  59     def readint24(self):
  60         return struct.unpack(">l", b"\0"+self.file.read(3))[0]
  61
  62     def readuint24(self):
  63         return struct.unpack(">L", b"\0"+self.file.read(3))[0]
  64
  65     def readint16(self):
  66         return struct.unpack(">h", self.file.read(2))[0]
  67
  68     def readuint16(self):
  69         return struct.unpack(">H", self.file.read(2))[0]
  70
  71     def readchar(self):
  72         return struct.unpack("b", self.file.read(1))[0]
  73
  74     def readuchar(self):
  75         return struct.unpack("B", self.file.read(1))[0]
  76
  77     def readstring(self, bytes):
  78         l = self.readuchar()
  79         assert l <= bytes-1, "inconsistency in file: string too long"
  80         return self.file.read(bytes-1)[:l]
  81
  82     def close(self):
  83         self.file.close()
  84
  85     def __enter__(self):
  86         return self
  87
  88     def __exit__(self, exc_type, exc_value, traceback):
  89         return self.file.__exit__(exc_type, exc_value, traceback)
  90
  91
  92 class bytesreader(reader):
  93
  94     def __init__(self, b):
  95         self.file = io.BytesIO(b)
  96
  97
  98 class PStokenizer:
  99     """cursor to read a string token by token"""
 100
 101     def __init__(self, data, startstring=None, eattokensep=1,
 102                  tokenseps=" \t\r\n", tokenstarts="()<>[]{}/%",
 103                  commentchar="%", newlinechars="\r\n"):
 104         """creates a cursor for the string data
 105
 106         startstring is a string at which the cursor should start at. The first
 107         ocurance of startstring is used. When startstring is not in data, an
 108         exception is raised, otherwise the cursor is set to the position right
 109         after the startstring. When eattokenseps is set, startstring must be
 110         followed by a tokensep and this first tokensep is also consumed.
 111         tokenseps is a string containing characters to be used as token
 112         separators. tokenstarts is a string containing characters which
 113         directly (even without intermediate token separator) start a new token.
 114         """
 115         self.data = data
 116         if startstring is not None:
 117             self.pos = self.data.index(startstring) + len(startstring)
 118         else:
 119             self.pos = 0
 120         self.tokenseps = tokenseps
 121         self.tokenstarts = tokenstarts
 122         self.commentchar = commentchar
 123         self.newlinechars = newlinechars
 124         if eattokensep:
 125             if self.data[self.pos] not in self.tokenstarts:
 126                 if self.data[self.pos] not in self.tokenseps:
 127                     raise ValueError("cursor initialization string is not followed by a token separator")
 128                 self.pos += 1
 129
 130     def gettoken(self):
 131         """get the next token
 132
 133         Leading token separators and comments are silently consumed. The first token
 134         separator after the token is also silently consumed."""
 135         while self.data[self.pos] in self.tokenseps:
 136             self.pos += 1
 137         # ignore comments including subsequent whitespace characters
 138         while self.data[self.pos] == self.commentchar:
 139             while self.data[self.pos] not in self.newlinechars:
 140                 self.pos += 1
 141             while self.data[self.pos] in self.tokenseps:
 142                 self.pos += 1
 143         startpos = self.pos
 144         while self.data[self.pos] not in self.tokenseps:
 145             # any character in self.tokenstarts ends the token
 146             if self.pos>startpos and self.data[self.pos] in self.tokenstarts:
 147                 break
 148             self.pos += 1
 149         result = self.data[startpos:self.pos]
 150         if self.data[self.pos] in self.tokenseps:
 151             self.pos += 1 # consume a single tokensep
 152         return result
 153
 154     def getint(self):
 155         """get the next token as an integer"""
 156         return int(self.gettoken())
 157
 158     def getbytes(self, count):
 159         """get the next count bytes"""
 160         startpos = self.pos
 161         self.pos += count
 162         return self.data[startpos: self.pos]
 163
 164
 165
 166 class PSbytes_tokenizer(PStokenizer):
 167
 168     def __init__(self, data, startstring=None, eattokensep=1,
 169                  tokenseps=b" \t\r\n", tokenstarts=b"()<>[]{}/%",
 170                  commentchar=b"%", newlinechars=b"\r\n"):
 171         super().__init__(data, startstring=startstring, eattokensep=eattokensep,
 172                          tokenseps=tokenseps, tokenstarts=tokenstarts,
 173                          commentchar=commentchar, newlinechars=newlinechars)