Lib/gzip.py

   1 """Functions that read and write gzipped files.
   2
   3 The user of the file doesn't have to worry about the compression,
   4 but random access is not allowed."""
   5
   6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
   7
   8 import struct, sys, time
   9 import zlib
  10 import __builtin__
  11
# Public names exported by "from gzip import *".
__all__ = ["GzipFile","open"]

# Bit masks for the flag byte of a gzip member header.  GzipFile tests
# FEXTRA, FNAME, FCOMMENT and FHCRC when reading a header and sets FNAME
# when writing one.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record whether the stream was opened
# for reading or for writing.
READ, WRITE = 1, 2
  17
  18 def U32(i):
  19     """Return i as an unsigned integer, assuming it fits in 32 bits.
  20
  21     If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
  22     """
  23     if i < 0:
  24         i += 1L << 32
  25     return i
  26
  27 def LOWU32(i):
  28     """Return the low-order 32 bits of an int, as a non-negative int."""
  29     return i & 0xFFFFFFFFL
  30
  31 def write32(output, value):
  32     output.write(struct.pack("<l", value))
  33
  34 def write32u(output, value):
  35     # The L format writes the bit pattern correctly whether signed
  36     # or unsigned.
  37     output.write(struct.pack("<L", value))
  38
  39 def read32(input):
  40     return struct.unpack("<l", input.read(4))[0]
  41
  42 def open(filename, mode="rb", compresslevel=9):
  43     """Shorthand for GzipFile(filename, mode, compresslevel).
  44
  45     The filename argument is required; mode defaults to 'rb'
  46     and compresslevel defaults to 9.
  47
  48     """
  49     return GzipFile(filename, mode, compresslevel)
  50
  51 class GzipFile:
  52     """The GzipFile class simulates most of the methods of a file object with
  53     the exception of the readinto() and truncate() methods.
  54
  55     """
  56
  57     myfileobj = None
  58
  59     def __init__(self, filename=None, mode=None,
  60                  compresslevel=9, fileobj=None):
  61         """Constructor for the GzipFile class.
  62
  63         At least one of fileobj and filename must be given a
  64         non-trivial value.
  65
  66         The new class instance is based on fileobj, which can be a regular
  67         file, a StringIO object, or any other object which simulates a file.
  68         It defaults to None, in which case filename is opened to provide
  69         a file object.
  70
  71         When fileobj is not None, the filename argument is only used to be
  72         included in the gzip file header, which may includes the original
  73         filename of the uncompressed file.  It defaults to the filename of
  74         fileobj, if discernible; otherwise, it defaults to the empty string,
  75         and in this case the original filename is not included in the header.
  76
  77         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
  78         depending on whether the file will be read or written.  The default
  79         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
  80         Be aware that only the 'rb', 'ab', and 'wb' values should be used
  81         for cross-platform portability.
  82
  83         The compresslevel argument is an integer from 1 to 9 controlling the
  84         level of compression; 1 is fastest and produces the least compression,
  85         and 9 is slowest and produces the most compression.  The default is 9.
  86
  87         """
  88
  89         # guarantee the file is opened in binary mode on platforms
  90         # that care about that sort of thing
  91         if mode and 'b' not in mode:
  92             mode += 'b'
  93         if fileobj is None:
  94             fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
  95         if filename is None:
  96             if hasattr(fileobj, 'name'): filename = fileobj.name
  97             else: filename = ''
  98         if mode is None:
  99             if hasattr(fileobj, 'mode'): mode = fileobj.mode
 100             else: mode = 'rb'
 101
 102         if mode[0:1] == 'r':
 103             self.mode = READ
 104             # Set flag indicating start of a new member
 105             self._new_member = True
 106             self.extrabuf = ""
 107             self.extrasize = 0
 108             self.filename = filename
 109
 110         elif mode[0:1] == 'w' or mode[0:1] == 'a':
 111             self.mode = WRITE
 112             self._init_write(filename)
 113             self.compress = zlib.compressobj(compresslevel,
 114                                              zlib.DEFLATED,
 115                                              -zlib.MAX_WBITS,
 116                                              zlib.DEF_MEM_LEVEL,
 117                                              0)
 118         else:
 119             raise IOError, "Mode " + mode + " not supported"
 120
 121         self.fileobj = fileobj
 122         self.offset = 0
 123
 124         if self.mode == WRITE:
 125             self._write_gzip_header()
 126
 127     def __repr__(self):
 128         s = repr(self.fileobj)
 129         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
 130
 131     def _init_write(self, filename):
 132         if filename[-3:] != '.gz':
 133             filename = filename + '.gz'
 134         self.filename = filename
 135         self.crc = zlib.crc32("")
 136         self.size = 0
 137         self.writebuf = []
 138         self.bufsize = 0
 139
 140     def _write_gzip_header(self):
 141         self.fileobj.write('\037\213')             # magic header
 142         self.fileobj.write('\010')                 # compression method
 143         fname = self.filename[:-3]
 144         flags = 0
 145         if fname:
 146             flags = FNAME
 147         self.fileobj.write(chr(flags))
 148         write32u(self.fileobj, long(time.time()))
 149         self.fileobj.write('\002')
 150         self.fileobj.write('\377')
 151         if fname:
 152             self.fileobj.write(fname + '\000')
 153
 154     def _init_read(self):
 155         self.crc = zlib.crc32("")
 156         self.size = 0
 157
 158     def _read_gzip_header(self):
 159         magic = self.fileobj.read(2)
 160         if magic != '\037\213':
 161             raise IOError, 'Not a gzipped file'
 162         method = ord( self.fileobj.read(1) )
 163         if method != 8:
 164             raise IOError, 'Unknown compression method'
 165         flag = ord( self.fileobj.read(1) )
 166         # modtime = self.fileobj.read(4)
 167         # extraflag = self.fileobj.read(1)
 168         # os = self.fileobj.read(1)
 169         self.fileobj.read(6)
 170
 171         if flag & FEXTRA:
 172             # Read & discard the extra field, if present
 173             xlen = ord(self.fileobj.read(1))
 174             xlen = xlen + 256*ord(self.fileobj.read(1))
 175             self.fileobj.read(xlen)
 176         if flag & FNAME:
 177             # Read and discard a null-terminated string containing the filename
 178             while True:
 179                 s = self.fileobj.read(1)
 180                 if not s or s=='\000':
 181                     break
 182         if flag & FCOMMENT:
 183             # Read and discard a null-terminated string containing a comment
 184             while True:
 185                 s = self.fileobj.read(1)
 186                 if not s or s=='\000':
 187                     break
 188         if flag & FHCRC:
 189             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 190
 191
 192     def write(self,data):
 193         if self.mode != WRITE:
 194             import errno
 195             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
 196
 197         if self.fileobj is None:
 198             raise ValueError, "write() on closed GzipFile object"
 199         if len(data) > 0:
 200             self.size = self.size + len(data)
 201             self.crc = zlib.crc32(data, self.crc)
 202             self.fileobj.write( self.compress.compress(data) )
 203             self.offset += len(data)
 204
 205     def read(self, size=-1):
 206         if self.mode != READ:
 207             import errno
 208             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
 209
 210         if self.extrasize <= 0 and self.fileobj is None:
 211             return ''
 212
 213         readsize = 1024
 214         if size < 0:        # get the whole thing
 215             try:
 216                 while True:
 217                     self._read(readsize)
 218                     readsize = readsize * 2
 219             except EOFError:
 220                 size = self.extrasize
 221         else:               # just get some more of it
 222             try:
 223                 while size > self.extrasize:
 224                     self._read(readsize)
 225                     readsize = readsize * 2
 226             except EOFError:
 227                 if size > self.extrasize:
 228                     size = self.extrasize
 229
 230         chunk = self.extrabuf[:size]
 231         self.extrabuf = self.extrabuf[size:]
 232         self.extrasize = self.extrasize - size
 233
 234         self.offset += size
 235         return chunk
 236
 237     def _unread(self, buf):
 238         self.extrabuf = buf + self.extrabuf
 239         self.extrasize = len(buf) + self.extrasize
 240         self.offset -= len(buf)
 241
 242     def _read(self, size=1024):
 243         if self.fileobj is None:
 244             raise EOFError, "Reached EOF"
 245
 246         if self._new_member:
 247             # If the _new_member flag is set, we have to
 248             # jump to the next member, if there is one.
 249             #
 250             # First, check if we're at the end of the file;
 251             # if so, it's time to stop; no more members to read.
 252             pos = self.fileobj.tell()   # Save current position
 253             self.fileobj.seek(0, 2)     # Seek to end of file
 254             if pos == self.fileobj.tell():
 255                 raise EOFError, "Reached EOF"
 256             else:
 257                 self.fileobj.seek( pos ) # Return to original position
 258
 259             self._init_read()
 260             self._read_gzip_header()
 261             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
 262             self._new_member = False
 263
 264         # Read a chunk of data from the file
 265         buf = self.fileobj.read(size)
 266
 267         # If the EOF has been reached, flush the decompression object
 268         # and mark this object as finished.
 269
 270         if buf == "":
 271             uncompress = self.decompress.flush()
 272             self._read_eof()
 273             self._add_read_data( uncompress )
 274             raise EOFError, 'Reached EOF'
 275
 276         uncompress = self.decompress.decompress(buf)
 277         self._add_read_data( uncompress )
 278
 279         if self.decompress.unused_data != "":
 280             # Ending case: we've come to the end of a member in the file,
 281             # so seek back to the start of the unused data, finish up
 282             # this member, and read a new gzip header.
 283             # (The number of bytes to seek back is the length of the unused
 284             # data, minus 8 because _read_eof() will rewind a further 8 bytes)
 285             self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
 286
 287             # Check the CRC and file size, and set the flag so we read
 288             # a new member on the next call
 289             self._read_eof()
 290             self._new_member = True
 291
 292     def _add_read_data(self, data):
 293         self.crc = zlib.crc32(data, self.crc)
 294         self.extrabuf = self.extrabuf + data
 295         self.extrasize = self.extrasize + len(data)
 296         self.size = self.size + len(data)
 297
 298     def _read_eof(self):
 299         # We've read to the end of the file, so we have to rewind in order
 300         # to reread the 8 bytes containing the CRC and the file size.
 301         # We check the that the computed CRC and size of the
 302         # uncompressed data matches the stored values.  Note that the size
 303         # stored is the true file size mod 2**32.
 304         self.fileobj.seek(-8, 1)
 305         crc32 = read32(self.fileobj)
 306         isize = U32(read32(self.fileobj))   # may exceed 2GB
 307         if U32(crc32) != U32(self.crc):
 308             raise IOError, "CRC check failed"
 309         elif isize != LOWU32(self.size):
 310             raise IOError, "Incorrect length of data produced"
 311
 312     def close(self):
 313         if self.mode == WRITE:
 314             self.fileobj.write(self.compress.flush())
 315             write32(self.fileobj, self.crc)
 316             # self.size may exceed 2GB, or even 4GB
 317             write32u(self.fileobj, LOWU32(self.size))
 318             self.fileobj = None
 319         elif self.mode == READ:
 320             self.fileobj = None
 321         if self.myfileobj:
 322             self.myfileobj.close()
 323             self.myfileobj = None
 324
 325     def __del__(self):
 326         try:
 327             if (self.myfileobj is None and
 328                 self.fileobj is None):
 329                 return
 330         except AttributeError:
 331             return
 332         self.close()
 333
 334     def flush(self):
 335         self.fileobj.flush()
 336
 337     def isatty(self):
 338         return False
 339
 340     def tell(self):
 341         return self.offset
 342
 343     def rewind(self):
 344         '''Return the uncompressed stream file position indicator to the
 345         beginning of the file'''
 346         if self.mode != READ:
 347             raise IOError("Can't rewind in write mode")
 348         self.fileobj.seek(0)
 349         self._new_member = True
 350         self.extrabuf = ""
 351         self.extrasize = 0
 352         self.offset = 0
 353
 354     def seek(self, offset):
 355         if self.mode == WRITE:
 356             if offset < self.offset:
 357                 raise IOError('Negative seek in write mode')
 358             count = offset - self.offset
 359             for i in range(count // 1024):
 360                 self.write(1024 * '\0')
 361             self.write((count % 1024) * '\0')
 362         elif self.mode == READ:
 363             if offset < self.offset:
 364                 # for negative seek, rewind and do positive seek
 365                 self.rewind()
 366             count = offset - self.offset
 367             for i in range(count // 1024):
 368                 self.read(1024)
 369             self.read(count % 1024)
 370
 371     def readline(self, size=-1):
 372         if size < 0: size = sys.maxint
 373         bufs = []
 374         readsize = min(100, size)    # Read from the file in small chunks
 375         while True:
 376             if size == 0:
 377                 return "".join(bufs) # Return resulting line
 378
 379             c = self.read(readsize)
 380             i = c.find('\n')
 381             if size is not None:
 382                 # We set i=size to break out of the loop under two
 383                 # conditions: 1) there's no newline, and the chunk is
 384                 # larger than size, or 2) there is a newline, but the
 385                 # resulting line would be longer than 'size'.
 386                 if i==-1 and len(c) > size: i=size-1
 387                 elif size <= i: i = size -1
 388
 389             if i >= 0 or c == '':
 390                 bufs.append(c[:i+1])    # Add portion of last chunk
 391                 self._unread(c[i+1:])   # Push back rest of chunk
 392                 return ''.join(bufs)    # Return resulting line
 393
 394             # Append chunk to list, decrease 'size',
 395             bufs.append(c)
 396             size = size - len(c)
 397             readsize = min(size, readsize * 2)
 398
 399     def readlines(self, sizehint=0):
 400         # Negative numbers result in reading all the lines
 401         if sizehint <= 0:
 402             sizehint = sys.maxint
 403         L = []
 404         while sizehint > 0:
 405             line = self.readline()
 406             if line == "":
 407                 break
 408             L.append(line)
 409             sizehint = sizehint - len(line)
 410
 411         return L
 412
 413     def writelines(self, L):
 414         for line in L:
 415             self.write(line)
 416
 417     def __iter__(self):
 418         return self
 419
 420     def next(self):
 421         line = self.readline()
 422         if line:
 423             return line
 424         else:
 425             raise StopIteration
 426
 427
 428 def _test():
 429     # Act like gzip; with -d, act like gunzip.
 430     # The input file is not deleted, however, nor are any other gzip
 431     # options or features supported.
 432     args = sys.argv[1:]
 433     decompress = args and args[0] == "-d"
 434     if decompress:
 435         args = args[1:]
 436     if not args:
 437         args = ["-"]
 438     for arg in args:
 439         if decompress:
 440             if arg == "-":
 441                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
 442                 g = sys.stdout
 443             else:
 444                 if arg[-3:] != ".gz":
 445                     print "filename doesn't end in .gz:", `arg`
 446                     continue
 447                 f = open(arg, "rb")
 448                 g = __builtin__.open(arg[:-3], "wb")
 449         else:
 450             if arg == "-":
 451                 f = sys.stdin
 452                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
 453             else:
 454                 f = __builtin__.open(arg, "rb")
 455                 g = open(arg + ".gz", "wb")
 456         while True:
 457             chunk = f.read(1024)
 458             if not chunk:
 459                 break
 460             g.write(chunk)
 461         if g is not sys.stdout:
 462             g.close()
 463         if f is not sys.stdin:
 464             f.close()
 465
# Run the command-line driver when this file is executed as a script.
if __name__ == '__main__':
    _test()