Lib/gzip.py

   1 import time
   2 import string
   3 import zlib
   4 import __builtin__
   5
# Implements GzipFile, a file-like class that reads and writes gzipped
# files, together with an open() convenience function.  The user of the
# file doesn't have to worry about the compression, but random access is
# not allowed.
   9
  10 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
  11
  12 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
  13
  14 READ, WRITE = 1, 2
  15
  16 def write32(output, value):
  17     t = divmod(value, 256)
  18     b1 = chr(t[1])
  19
  20     t = divmod(t[0], 256)
  21     b2 = chr(t[1])
  22
  23     t = divmod(t[0], 256)
  24     b3 = chr(t[1])
  25
  26     t = divmod(t[0], 256)
  27     b4 = chr(t[1])
  28
  29     buf = b1 + b2 + b3 + b4
  30     output.write(buf)
  31
  32
  33 def read32(input):
  34     buf = input.read(4)
  35     v = ord(buf[0])
  36     v = v + (ord(buf[1]) << 8)
  37     v = v + (ord(buf[2]) << 16)
  38     v = v + (ord(buf[3]) << 24)
  39     return v
  40
  41 def open(filename, mode="r", compresslevel=9):
  42     return GzipFile(filename, mode, compresslevel)
  43
  44 class GzipFile:
  45
  46     myfileobj = None
  47
  48     def __init__(self, filename=None, mode=None,
  49                  compresslevel=9, fileobj=None):
  50         if fileobj is None:
  51             fileobj = self.myfileobj = __builtin__.open(filename, mode or 'r')
  52         if filename is None:
  53             if hasattr(fileobj, 'name'): filename = fileobj.name
  54             else: filename = ''
  55         if mode is None:
  56             if hasattr(fileobj, 'mode'): mode = fileobj.mode
  57             else: mode = 'r'
  58
  59         if mode[0:1] == 'r':
  60             self.mode = READ
  61             self._init_read()
  62             self.filename = filename
  63             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
  64
  65         elif mode[0:1] == 'w':
  66             self.mode = WRITE
  67             self._init_write(filename)
  68             self.compress = zlib.compressobj(compresslevel,
  69                                              zlib.DEFLATED,
  70                                              -zlib.MAX_WBITS,
  71                                              zlib.DEF_MEM_LEVEL,
  72                                              0)
  73         else:
  74             raise ValueError, "Mode " + mode + " not supported"
  75
  76         self.fileobj = fileobj
  77
  78         if self.mode == WRITE:
  79             self._write_gzip_header()
  80         elif self.mode == READ:
  81             self._read_gzip_header()
  82
  83     def __repr__(self):
  84         s = repr(self.fileobj)
  85         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
  86
  87     def _init_write(self, filename):
  88         if filename[-3:] != '.gz':
  89             filename = filename + '.gz'
  90         self.filename = filename
  91         self.crc = zlib.crc32("")
  92         self.size = 0
  93         self.writebuf = []
  94         self.bufsize = 0
  95
  96     def _write_gzip_header(self):
  97         self.fileobj.write('\037\213')             # magic header
  98         self.fileobj.write('\010')                 # compression method
  99         fname = self.filename[:-3]
 100         flags = 0
 101         if fname:
 102             flags = FNAME
 103         self.fileobj.write(chr(flags))
 104         write32(self.fileobj, int(time.time()))
 105         self.fileobj.write('\002')
 106         self.fileobj.write('\377')
 107         if fname:
 108             self.fileobj.write(fname + '\000')
 109
 110     def _init_read(self):
 111         self.crc = zlib.crc32("")
 112         self.size = 0
 113         self.extrabuf = ""
 114         self.extrasize = 0
 115
 116     def _read_gzip_header(self):
 117         magic = self.fileobj.read(2)
 118         if magic != '\037\213':
 119             raise RuntimeError, 'Not a gzipped file'
 120         method = ord( self.fileobj.read(1) )
 121         if method != 8:
 122             raise RuntimeError, 'Unknown compression method'
 123         flag = ord( self.fileobj.read(1) )
 124         # modtime = self.fileobj.read(4)
 125         # extraflag = self.fileobj.read(1)
 126         # os = self.fileobj.read(1)
 127         self.fileobj.read(6)
 128
 129         if flag & FEXTRA:
 130             # Read & discard the extra field, if present
 131             xlen=ord(self.fileobj.read(1))
 132             xlen=xlen+256*ord(self.fileobj.read(1))
 133             self.fileobj.read(xlen)
 134         if flag & FNAME:
 135             # Read and discard a null-terminated string containing the filename
 136             while (1):
 137                 s=self.fileobj.read(1)
 138                 if not s or s=='\000': break
 139         if flag & FCOMMENT:
 140             # Read and discard a null-terminated string containing a comment
 141             while (1):
 142                 s=self.fileobj.read(1)
 143                 if not s or s=='\000': break
 144         if flag & FHCRC:
 145             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 146
 147
 148     def write(self,data):
 149         if self.fileobj is None:
 150             raise ValueError, "write() on closed GzipFile object"
 151         if len(data) > 0:
 152             self.size = self.size + len(data)
 153             self.crc = zlib.crc32(data, self.crc)
 154             self.fileobj.write( self.compress.compress(data) )
 155
 156     def writelines(self,lines):
 157         self.write(string.join(lines))
 158
 159     def read(self, size=None):
 160         if self.extrasize <= 0 and self.fileobj is None:
 161             return ''
 162
 163         readsize = 1024
 164         if not size:        # get the whole thing
 165             try:
 166                 while 1:
 167                     self._read(readsize)
 168                     readsize = readsize * 2
 169             except EOFError:
 170                 size = self.extrasize
 171         else:               # just get some more of it
 172             try:
 173                 while size > self.extrasize:
 174                     self._read(readsize)
 175                     readsize = readsize * 2
 176             except EOFError:
 177                 if size > self.extrasize:
 178                     size = self.extrasize
 179
 180         chunk = self.extrabuf[:size]
 181         self.extrabuf = self.extrabuf[size:]
 182         self.extrasize = self.extrasize - size
 183
 184         return chunk
 185
 186     def _unread(self, buf):
 187         self.extrabuf = buf + self.extrabuf
 188         self.extrasize = len(buf) + self.extrasize
 189
 190     def _read(self, size=1024):
 191         try:
 192             buf = self.fileobj.read(size)
 193         except AttributeError:
 194             raise EOFError, "Reached EOF"
 195         if buf == "":
 196             uncompress = self.decompress.flush()
 197             if uncompress == "":
 198                 self._read_eof()
 199                 self.fileobj = None
 200                 raise EOFError, 'Reached EOF'
 201         else:
 202             uncompress = self.decompress.decompress(buf)
 203         self.crc = zlib.crc32(uncompress, self.crc)
 204         self.extrabuf = self.extrabuf + uncompress
 205         self.extrasize = self.extrasize + len(uncompress)
 206         self.size = self.size + len(uncompress)
 207
 208     def _read_eof(self):
 209         # Andrew writes:
 210         ## We've read to the end of the file, so we have to rewind in order
 211         ## to reread the 8 bytes containing the CRC and the file size.  The
 212         ## decompressor is smart and knows when to stop, so feeding it
 213         ## extra data is harmless.
 214         self.fileobj.seek(-8, 2)
 215         crc32 = read32(self.fileobj)
 216         isize = read32(self.fileobj)
 217         if crc32 != self.crc:
 218             self.error = "CRC check failed"
 219         elif isize != self.size:
 220             self.error = "Incorrect length of data produced"
 221
 222     def close(self):
 223         if self.mode == WRITE:
 224             self.fileobj.write(self.compress.flush())
 225             write32(self.fileobj, self.crc)
 226             write32(self.fileobj, self.size)
 227             self.fileobj = None
 228         elif self.mode == READ:
 229             self.fileobj = None
 230         if self.myfileobj:
 231             self.myfileobj.close()
 232             self.myfileobj = None
 233
 234     def flush(self):
 235         self.fileobj.flush()
 236
 237     def seek(self):
 238         raise IOError, 'Random access not allowed in gzip files'
 239
 240     def tell(self):
 241         raise IOError, 'I won\'t tell() you for gzip files'
 242
 243     def isatty(self):
 244         return 0
 245
 246     def readline(self):
 247         bufs = []
 248         readsize = 100
 249         while 1:
 250             c = self.read(readsize)
 251             i = string.find(c, '\n')
 252             if i >= 0 or c == '':
 253                 bufs.append(c[:i+1])
 254                 self._unread(c[i+1:])
 255                 return string.join(bufs, '')
 256             bufs.append(c)
 257             readsize = readsize * 2
 258
 259     def readlines(self):
 260         buf = self.read()
 261         lines = string.split(buf, '\n')
 262         for i in range(len(lines)-1):
 263             lines[i] = lines[i] + '\n'
 264         if lines and not lines[-1]:
 265             del lines[-1]
 266         return lines
 267
 268     def writelines(self, L):
 269         for line in L:
 270             self.write(line)
 271
 272
 273 def _test():
 274     # Act like gzip; with -d, act like gunzip.
 275     # The input file is not deleted, however, nor are any other gzip
 276     # options or features supported.
 277     import sys
 278     args = sys.argv[1:]
 279     decompress = args and args[0] == "-d"
 280     if decompress:
 281         args = args[1:]
 282     if not args:
 283         args = ["-"]
 284     for arg in args:
 285         if decompress:
 286             if arg == "-":
 287                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
 288                 g = sys.stdout
 289             else:
 290                 if arg[-3:] != ".gz":
 291                     print "filename doesn't end in .gz:", `arg`
 292                     continue
 293                 f = open(arg, "rb")
 294                 g = __builtin__.open(arg[:-3], "wb")
 295         else:
 296             if arg == "-":
 297                 f = sys.stdin
 298                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
 299             else:
 300                 f = __builtin__.open(arg, "rb")
 301                 g = open(arg + ".gz", "wb")
 302         while 1:
 303             chunk = f.read(1024)
 304             if not chunk:
 305                 break
 306             g.write(chunk)
 307         if g is not sys.stdout:
 308             g.close()
 309         if f is not sys.stdin:
 310             f.close()
 311
 312 if __name__ == '__main__':
 313     _test()