reader.py

   1 # -*- encoding: utf-8 -*-
   2 #
   3 #
   4 # Copyright (C) 2007-2011 Jörg Lehmann <joergl@users.sourceforge.net>
   5 # Copyright (C) 2007-2011 André Wobst <wobsta@users.sourceforge.net>
   6 #
   7 # This file is part of PyX (http://pyx.sourceforge.net/).
   8 #
   9 # PyX is free software; you can redistribute it and/or modify
  10 # it under the terms of the GNU General Public License as published by
  11 # the Free Software Foundation; either version 2 of the License, or
  12 # (at your option) any later version.
  13 #
  14 # PyX is distributed in the hope that it will be useful,
  15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 # GNU General Public License for more details.
  18 #
  19 # You should have received a copy of the GNU General Public License
  20 # along with PyX; if not, write to the Free Software
  21 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
  22
  23
  24 import io, struct
  25
  26
  27 class reader:
  28
  29     def __init__(self, filename):
  30         self.file = open(filename, "rb")
  31
  32     def close(self):
  33         self.file.close()
  34
  35     def tell(self):
  36         return self.file.tell()
  37
  38     def eof(self):
  39         return self.file.eof()
  40
  41     def read(self, bytes):
  42         return self.file.read(bytes)
  43
  44     def readint(self, bytes=4, signed=0):
  45         first = 1
  46         result = 0
  47         while bytes:
  48             value = ord(self.file.read(1))
  49             if first and signed and value > 127:
  50                 value -= 256
  51             first = 0
  52             result = 256 * result + value
  53             bytes -= 1
  54         return result
  55
  56     def readint32(self):
  57         return struct.unpack(">l", self.file.read(4))[0]
  58
  59     def readuint32(self):
  60         return struct.unpack(">L", self.file.read(4))[0]
  61
  62     def readint24(self):
  63         return struct.unpack(">l", b"\0"+self.file.read(3))[0]
  64
  65     def readuint24(self):
  66         return struct.unpack(">L", b"\0"+self.file.read(3))[0]
  67
  68     def readint16(self):
  69         return struct.unpack(">h", self.file.read(2))[0]
  70
  71     def readuint16(self):
  72         return struct.unpack(">H", self.file.read(2))[0]
  73
  74     def readchar(self):
  75         return struct.unpack("b", self.file.read(1))[0]
  76
  77     def readuchar(self):
  78         return struct.unpack("B", self.file.read(1))[0]
  79
  80     def readstring(self, bytes):
  81         l = self.readuchar()
  82         assert l <= bytes-1, "inconsistency in file: string too long"
  83         return self.file.read(bytes-1)[:l]
  84
  85
  86 class bytesreader(reader):
  87
  88     def __init__(self, b):
  89         self.file = io.BytesIO(b)
  90
  91
  92 class PStokenizer:
  93     """cursor to read a string token by token"""
  94
  95     def __init__(self, data, startstring=None, eattokensep=1,
  96                  tokenseps=" \t\r\n", tokenstarts="()<>[]{}/%",
  97                  commentchar="%", newlinechars="\r\n"):
  98         """creates a cursor for the string data
  99
 100         startstring is a string at which the cursor should start at. The first
 101         ocurance of startstring is used. When startstring is not in data, an
 102         exception is raised, otherwise the cursor is set to the position right
 103         after the startstring. When eattokenseps is set, startstring must be
 104         followed by a tokensep and this first tokensep is also consumed.
 105         tokenseps is a string containing characters to be used as token
 106         separators. tokenstarts is a string containing characters which
 107         directly (even without intermediate token separator) start a new token.
 108         """
 109         self.data = data
 110         if startstring is not None:
 111             self.pos = self.data.index(startstring) + len(startstring)
 112         else:
 113             self.pos = 0
 114         self.tokenseps = tokenseps
 115         self.tokenstarts = tokenstarts
 116         self.commentchar = commentchar
 117         self.newlinechars = newlinechars
 118         if eattokensep:
 119             if self.data[self.pos] not in self.tokenstarts:
 120                 if self.data[self.pos] not in self.tokenseps:
 121                     raise ValueError("cursor initialization string is not followed by a token separator")
 122                 self.pos += 1
 123
 124     def gettoken(self):
 125         """get the next token
 126
 127         Leading token separators and comments are silently consumed. The first token
 128         separator after the token is also silently consumed."""
 129         while self.data[self.pos] in self.tokenseps:
 130             self.pos += 1
 131         # ignore comments including subsequent whitespace characters
 132         while self.data[self.pos] == self.commentchar:
 133             while self.data[self.pos] not in self.newlinechars:
 134                 self.pos += 1
 135             while self.data[self.pos] in self.tokenseps:
 136                 self.pos += 1
 137         startpos = self.pos
 138         while self.data[self.pos] not in self.tokenseps:
 139             # any character in self.tokenstarts ends the token
 140             if self.pos>startpos and self.data[self.pos] in self.tokenstarts:
 141                 break
 142             self.pos += 1
 143         result = self.data[startpos:self.pos]
 144         if self.data[self.pos] in self.tokenseps:
 145             self.pos += 1 # consume a single tokensep
 146         return result
 147
 148     def getint(self):
 149         """get the next token as an integer"""
 150         return int(self.gettoken())
 151
 152     def getbytes(self, count):
 153         """get the next count bytes"""
 154         startpos = self.pos
 155         self.pos += count
 156         return self.data[startpos: self.pos]
 157
 158
 159
 160 class PSbytes_tokenizer(PStokenizer):
 161
 162     def __init__(self, data, startstring=None, eattokensep=1,
 163                  tokenseps=b" \t\r\n", tokenstarts=b"()<>[]{}/%",
 164                  commentchar=b"%", newlinechars=b"\r\n"):
 165         super().__init__(data, startstring=startstring, eattokensep=eattokensep,
 166                          tokenseps=tokenseps, tokenstarts=tokenstarts,
 167                          commentchar=commentchar, newlinechars=newlinechars)