Lib/gzip.py

   1 """Functions that read and write gzipped files.
   2
   3 The user of the file doesn't have to worry about the compression,
   4 but random access is not allowed."""
   5
   6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
   7
   8 import struct, sys, time
   9 import zlib
  10 import __builtin__
  11
  12 __all__ = ["GzipFile","open"]
  13
  14 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
  15
  16 READ, WRITE = 1, 2
  17
  18 def write32(output, value):
  19     output.write(struct.pack("<l", value))
  20
  21 def write32u(output, value):
  22     if value < 0:
  23         value = value + 0x100000000L
  24     output.write(struct.pack("<L", value))
  25
  26 def read32(input):
  27     return struct.unpack("<l", input.read(4))[0]
  28
  29 def open(filename, mode="rb", compresslevel=9):
  30     """Shorthand for GzipFile(filename, mode, compresslevel).
  31
  32     The filename argument is required; mode defaults to 'rb'
  33     and compresslevel defaults to 9.
  34
  35     """
  36     return GzipFile(filename, mode, compresslevel)
  37
  38 class GzipFile:
  39     """The GzipFile class simulates most of the methods of a file object with
  40     the exception of the readinto() and truncate() methods.
  41
  42     """
  43
  44     myfileobj = None
  45
  46     def __init__(self, filename=None, mode=None,
  47                  compresslevel=9, fileobj=None):
  48         """Constructor for the GzipFile class.
  49
  50         At least one of fileobj and filename must be given a
  51         non-trivial value.
  52
  53         The new class instance is based on fileobj, which can be a regular
  54         file, a StringIO object, or any other object which simulates a file.
  55         It defaults to None, in which case filename is opened to provide
  56         a file object.
  57
  58         When fileobj is not None, the filename argument is only used to be
  59         included in the gzip file header, which may includes the original
  60         filename of the uncompressed file.  It defaults to the filename of
  61         fileobj, if discernible; otherwise, it defaults to the empty string,
  62         and in this case the original filename is not included in the header.
  63
  64         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
  65         depending on whether the file will be read or written.  The default
  66         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
  67         Be aware that only the 'rb', 'ab', and 'wb' values should be used
  68         for cross-platform portability.
  69
  70         The compresslevel argument is an integer from 1 to 9 controlling the
  71         level of compression; 1 is fastest and produces the least compression,
  72         and 9 is slowest and produces the most compression.  The default is 9.
  73
  74         """
  75
  76         # guarantee the file is opened in binary mode on platforms
  77         # that care about that sort of thing
  78         if mode and 'b' not in mode:
  79             mode += 'b'
  80         if fileobj is None:
  81             fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
  82         if filename is None:
  83             if hasattr(fileobj, 'name'): filename = fileobj.name
  84             else: filename = ''
  85         if mode is None:
  86             if hasattr(fileobj, 'mode'): mode = fileobj.mode
  87             else: mode = 'rb'
  88
  89         if mode[0:1] == 'r':
  90             self.mode = READ
  91             # Set flag indicating start of a new member
  92             self._new_member = True
  93             self.extrabuf = ""
  94             self.extrasize = 0
  95             self.filename = filename
  96
  97         elif mode[0:1] == 'w' or mode[0:1] == 'a':
  98             self.mode = WRITE
  99             self._init_write(filename)
 100             self.compress = zlib.compressobj(compresslevel,
 101                                              zlib.DEFLATED,
 102                                              -zlib.MAX_WBITS,
 103                                              zlib.DEF_MEM_LEVEL,
 104                                              0)
 105         else:
 106             raise IOError, "Mode " + mode + " not supported"
 107
 108         self.fileobj = fileobj
 109         self.offset = 0
 110
 111         if self.mode == WRITE:
 112             self._write_gzip_header()
 113
 114     def __repr__(self):
 115         s = repr(self.fileobj)
 116         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
 117
 118     def _init_write(self, filename):
 119         if filename[-3:] != '.gz':
 120             filename = filename + '.gz'
 121         self.filename = filename
 122         self.crc = zlib.crc32("")
 123         self.size = 0
 124         self.writebuf = []
 125         self.bufsize = 0
 126
 127     def _write_gzip_header(self):
 128         self.fileobj.write('\037\213')             # magic header
 129         self.fileobj.write('\010')                 # compression method
 130         fname = self.filename[:-3]
 131         flags = 0
 132         if fname:
 133             flags = FNAME
 134         self.fileobj.write(chr(flags))
 135         write32u(self.fileobj, long(time.time()))
 136         self.fileobj.write('\002')
 137         self.fileobj.write('\377')
 138         if fname:
 139             self.fileobj.write(fname + '\000')
 140
 141     def _init_read(self):
 142         self.crc = zlib.crc32("")
 143         self.size = 0
 144
 145     def _read_gzip_header(self):
 146         magic = self.fileobj.read(2)
 147         if magic != '\037\213':
 148             raise IOError, 'Not a gzipped file'
 149         method = ord( self.fileobj.read(1) )
 150         if method != 8:
 151             raise IOError, 'Unknown compression method'
 152         flag = ord( self.fileobj.read(1) )
 153         # modtime = self.fileobj.read(4)
 154         # extraflag = self.fileobj.read(1)
 155         # os = self.fileobj.read(1)
 156         self.fileobj.read(6)
 157
 158         if flag & FEXTRA:
 159             # Read & discard the extra field, if present
 160             xlen=ord(self.fileobj.read(1))
 161             xlen=xlen+256*ord(self.fileobj.read(1))
 162             self.fileobj.read(xlen)
 163         if flag & FNAME:
 164             # Read and discard a null-terminated string containing the filename
 165             while True:
 166                 s=self.fileobj.read(1)
 167                 if not s or s=='\000': break
 168         if flag & FCOMMENT:
 169             # Read and discard a null-terminated string containing a comment
 170             while True:
 171                 s=self.fileobj.read(1)
 172                 if not s or s=='\000': break
 173         if flag & FHCRC:
 174             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 175
 176
 177     def write(self,data):
 178         if self.mode != WRITE:
 179             import errno
 180             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
 181
 182         if self.fileobj is None:
 183             raise ValueError, "write() on closed GzipFile object"
 184         if len(data) > 0:
 185             self.size = self.size + len(data)
 186             self.crc = zlib.crc32(data, self.crc)
 187             self.fileobj.write( self.compress.compress(data) )
 188             self.offset += len(data)
 189
 190     def read(self, size=-1):
 191         if self.mode != READ:
 192             import errno
 193             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
 194
 195         if self.extrasize <= 0 and self.fileobj is None:
 196             return ''
 197
 198         readsize = 1024
 199         if size < 0:        # get the whole thing
 200             try:
 201                 while True:
 202                     self._read(readsize)
 203                     readsize = readsize * 2
 204             except EOFError:
 205                 size = self.extrasize
 206         else:               # just get some more of it
 207             try:
 208                 while size > self.extrasize:
 209                     self._read(readsize)
 210                     readsize = readsize * 2
 211             except EOFError:
 212                 if size > self.extrasize:
 213                     size = self.extrasize
 214
 215         chunk = self.extrabuf[:size]
 216         self.extrabuf = self.extrabuf[size:]
 217         self.extrasize = self.extrasize - size
 218
 219         self.offset += size
 220         return chunk
 221
 222     def _unread(self, buf):
 223         self.extrabuf = buf + self.extrabuf
 224         self.extrasize = len(buf) + self.extrasize
 225         self.offset -= len(buf)
 226
 227     def _read(self, size=1024):
 228         if self.fileobj is None: raise EOFError, "Reached EOF"
 229
 230         if self._new_member:
 231             # If the _new_member flag is set, we have to
 232             # jump to the next member, if there is one.
 233             #
 234             # First, check if we're at the end of the file;
 235             # if so, it's time to stop; no more members to read.
 236             pos = self.fileobj.tell()   # Save current position
 237             self.fileobj.seek(0, 2)     # Seek to end of file
 238             if pos == self.fileobj.tell():
 239                 raise EOFError, "Reached EOF"
 240             else:
 241                 self.fileobj.seek( pos ) # Return to original position
 242
 243             self._init_read()
 244             self._read_gzip_header()
 245             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
 246             self._new_member = False
 247
 248         # Read a chunk of data from the file
 249         buf = self.fileobj.read(size)
 250
 251         # If the EOF has been reached, flush the decompression object
 252         # and mark this object as finished.
 253
 254         if buf == "":
 255             uncompress = self.decompress.flush()
 256             self._read_eof()
 257             self._add_read_data( uncompress )
 258             raise EOFError, 'Reached EOF'
 259
 260         uncompress = self.decompress.decompress(buf)
 261         self._add_read_data( uncompress )
 262
 263         if self.decompress.unused_data != "":
 264             # Ending case: we've come to the end of a member in the file,
 265             # so seek back to the start of the unused data, finish up
 266             # this member, and read a new gzip header.
 267             # (The number of bytes to seek back is the length of the unused
 268             # data, minus 8 because _read_eof() will rewind a further 8 bytes)
 269             self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
 270
 271             # Check the CRC and file size, and set the flag so we read
 272             # a new member on the next call
 273             self._read_eof()
 274             self._new_member = True
 275
 276     def _add_read_data(self, data):
 277         self.crc = zlib.crc32(data, self.crc)
 278         self.extrabuf = self.extrabuf + data
 279         self.extrasize = self.extrasize + len(data)
 280         self.size = self.size + len(data)
 281
 282     def _read_eof(self):
 283         # We've read to the end of the file, so we have to rewind in order
 284         # to reread the 8 bytes containing the CRC and the file size.
 285         # We check the that the computed CRC and size of the
 286         # uncompressed data matches the stored values.
 287         self.fileobj.seek(-8, 1)
 288         crc32 = read32(self.fileobj)
 289         isize = read32(self.fileobj)
 290         if crc32%0x100000000L != self.crc%0x100000000L:
 291             raise ValueError, "CRC check failed"
 292         elif isize != self.size:
 293             raise ValueError, "Incorrect length of data produced"
 294
 295     def close(self):
 296         if self.mode == WRITE:
 297             self.fileobj.write(self.compress.flush())
 298             write32(self.fileobj, self.crc)
 299             write32(self.fileobj, self.size)
 300             self.fileobj = None
 301         elif self.mode == READ:
 302             self.fileobj = None
 303         if self.myfileobj:
 304             self.myfileobj.close()
 305             self.myfileobj = None
 306
 307     def __del__(self):
 308         try:
 309             if (self.myfileobj is None and
 310                 self.fileobj is None):
 311                 return
 312         except AttributeError:
 313             return
 314         self.close()
 315
 316     def flush(self):
 317         self.fileobj.flush()
 318
 319     def isatty(self):
 320         return False
 321
 322     def tell(self):
 323         return self.offset
 324
 325     def rewind(self):
 326         '''Return the uncompressed stream file position indicator to the
 327         beginning of the file'''
 328         if self.mode != READ:
 329             raise IOError("Can't rewind in write mode")
 330         self.fileobj.seek(0)
 331         self._new_member = True
 332         self.extrabuf = ""
 333         self.extrasize = 0
 334         self.offset = 0
 335
 336     def seek(self, offset):
 337         if self.mode == WRITE:
 338             if offset < self.offset:
 339                 raise IOError('Negative seek in write mode')
 340             count = offset - self.offset
 341             for i in range(count/1024):
 342                 self.write(1024*'\0')
 343             self.write((count%1024)*'\0')
 344         elif self.mode == READ:
 345             if offset < self.offset:
 346                 # for negative seek, rewind and do positive seek
 347                 self.rewind()
 348             count = offset - self.offset
 349             for i in range(count/1024): self.read(1024)
 350             self.read(count % 1024)
 351
 352     def readline(self, size=-1):
 353         if size < 0: size = sys.maxint
 354         bufs = []
 355         readsize = min(100, size)    # Read from the file in small chunks
 356         while True:
 357             if size == 0:
 358                 return "".join(bufs) # Return resulting line
 359
 360             c = self.read(readsize)
 361             i = c.find('\n')
 362             if size is not None:
 363                 # We set i=size to break out of the loop under two
 364                 # conditions: 1) there's no newline, and the chunk is
 365                 # larger than size, or 2) there is a newline, but the
 366                 # resulting line would be longer than 'size'.
 367                 if i==-1 and len(c) > size: i=size-1
 368                 elif size <= i: i = size -1
 369
 370             if i >= 0 or c == '':
 371                 bufs.append(c[:i+1])    # Add portion of last chunk
 372                 self._unread(c[i+1:])   # Push back rest of chunk
 373                 return ''.join(bufs)    # Return resulting line
 374
 375             # Append chunk to list, decrease 'size',
 376             bufs.append(c)
 377             size = size - len(c)
 378             readsize = min(size, readsize * 2)
 379
 380     def readlines(self, sizehint=0):
 381         # Negative numbers result in reading all the lines
 382         if sizehint <= 0: sizehint = sys.maxint
 383         L = []
 384         while sizehint > 0:
 385             line = self.readline()
 386             if line == "": break
 387             L.append(line)
 388             sizehint = sizehint - len(line)
 389
 390         return L
 391
 392     def writelines(self, L):
 393         for line in L:
 394             self.write(line)
 395
 396     def __iter__(self):
 397         return self
 398
 399     def next(self):
 400         line = self.readline()
 401         if line:
 402             return line
 403         else:
 404             raise StopIteration
 405
 406
 407 def _test():
 408     # Act like gzip; with -d, act like gunzip.
 409     # The input file is not deleted, however, nor are any other gzip
 410     # options or features supported.
 411     args = sys.argv[1:]
 412     decompress = args and args[0] == "-d"
 413     if decompress:
 414         args = args[1:]
 415     if not args:
 416         args = ["-"]
 417     for arg in args:
 418         if decompress:
 419             if arg == "-":
 420                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
 421                 g = sys.stdout
 422             else:
 423                 if arg[-3:] != ".gz":
 424                     print "filename doesn't end in .gz:", `arg`
 425                     continue
 426                 f = open(arg, "rb")
 427                 g = __builtin__.open(arg[:-3], "wb")
 428         else:
 429             if arg == "-":
 430                 f = sys.stdin
 431                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
 432             else:
 433                 f = __builtin__.open(arg, "rb")
 434                 g = open(arg + ".gz", "wb")
 435         while True:
 436             chunk = f.read(1024)
 437             if not chunk:
 438                 break
 439             g.write(chunk)
 440         if g is not sys.stdout:
 441             g.close()
 442         if f is not sys.stdin:
 443             f.close()
 444
# Run the command-line driver only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    _test()