Lib/gzip.py

   1 """Functions that read and write gzipped files.
   2
   3 The user of the file doesn't have to worry about the compression,
   4 but random access is not allowed."""
   5
   6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
   7
   8 import struct, sys, time
   9 import zlib
  10 import builtins
  11
# Names exported by "from gzip import *".
__all__ = ["GzipFile","open"]

# Bit masks for the flag byte of a gzip member header.  GzipFile tests
# FEXTRA, FNAME, FCOMMENT and FHCRC when skipping optional header fields
# and sets FNAME when it writes a file name into the header.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ, WRITE = 1, 2
  17
  18 def U32(i):
  19     """Return i as an unsigned integer, assuming it fits in 32 bits.
  20     If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
  21     """
  22     if i < 0:
  23         i += 1 << 32
  24     return i
  25
  26 def LOWU32(i):
  27     """Return the low-order 32 bits, as a non-negative int"""
  28     return i & 0xFFFFFFFF
  29
  30 def write32u(output, value):
  31     # The L format writes the bit pattern correctly whether signed
  32     # or unsigned.
  33     output.write(struct.pack("<L", value))
  34
  35 def read32(input):
  36     return struct.unpack("<I", input.read(4))[0]
  37
  38 def open(filename, mode="rb", compresslevel=9):
  39     """Shorthand for GzipFile(filename, mode, compresslevel).
  40
  41     The filename argument is required; mode defaults to 'rb'
  42     and compresslevel defaults to 9.
  43
  44     """
  45     return GzipFile(filename, mode, compresslevel)
  46
  47 class GzipFile:
  48     """The GzipFile class simulates most of the methods of a file object with
  49     the exception of the readinto() and truncate() methods.
  50
  51     """
  52
  53     myfileobj = None
  54     max_read_chunk = 10 * 1024 * 1024   # 10Mb
  55
  56     def __init__(self, filename=None, mode=None,
  57                  compresslevel=9, fileobj=None):
  58         """Constructor for the GzipFile class.
  59
  60         At least one of fileobj and filename must be given a
  61         non-trivial value.
  62
  63         The new class instance is based on fileobj, which can be a regular
  64         file, a StringIO object, or any other object which simulates a file.
  65         It defaults to None, in which case filename is opened to provide
  66         a file object.
  67
  68         When fileobj is not None, the filename argument is only used to be
  69         included in the gzip file header, which may includes the original
  70         filename of the uncompressed file.  It defaults to the filename of
  71         fileobj, if discernible; otherwise, it defaults to the empty string,
  72         and in this case the original filename is not included in the header.
  73
  74         The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
  75         depending on whether the file will be read or written.  The default
  76         is the mode of fileobj if discernible; otherwise, the default is 'rb'.
  77         Be aware that only the 'rb', 'ab', and 'wb' values should be used
  78         for cross-platform portability.
  79
  80         The compresslevel argument is an integer from 1 to 9 controlling the
  81         level of compression; 1 is fastest and produces the least compression,
  82         and 9 is slowest and produces the most compression.  The default is 9.
  83
  84         """
  85
  86         # guarantee the file is opened in binary mode on platforms
  87         # that care about that sort of thing
  88         if mode and 'b' not in mode:
  89             mode += 'b'
  90         if fileobj is None:
  91             fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
  92         if filename is None:
  93             if hasattr(fileobj, 'name'): filename = fileobj.name
  94             else: filename = ''
  95         if mode is None:
  96             if hasattr(fileobj, 'mode'): mode = fileobj.mode
  97             else: mode = 'rb'
  98
  99         if mode[0:1] == 'r':
 100             self.mode = READ
 101             # Set flag indicating start of a new member
 102             self._new_member = True
 103             self.extrabuf = b""
 104             self.extrasize = 0
 105             self.name = filename
 106             # Starts small, scales exponentially
 107             self.min_readsize = 100
 108
 109         elif mode[0:1] == 'w' or mode[0:1] == 'a':
 110             self.mode = WRITE
 111             self._init_write(filename)
 112             self.compress = zlib.compressobj(compresslevel,
 113                                              zlib.DEFLATED,
 114                                              -zlib.MAX_WBITS,
 115                                              zlib.DEF_MEM_LEVEL,
 116                                              0)
 117         else:
 118             raise IOError("Mode " + mode + " not supported")
 119
 120         self.fileobj = fileobj
 121         self.offset = 0
 122
 123         if self.mode == WRITE:
 124             self._write_gzip_header()
 125
 126     @property
 127     def filename(self):
 128         import warnings
 129         warnings.warn("use the name attribute", DeprecationWarning)
 130         if self.mode == WRITE and self.name[-3:] != ".gz":
 131             return self.name + ".gz"
 132         return self.name
 133
 134     def __repr__(self):
 135         s = repr(self.fileobj)
 136         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
 137
 138     def _init_write(self, filename):
 139         self.name = filename
 140         self.crc = zlib.crc32("") & 0xffffffff
 141         self.size = 0
 142         self.writebuf = []
 143         self.bufsize = 0
 144
 145     def _write_gzip_header(self):
 146         self.fileobj.write(b'\037\213')             # magic header
 147         self.fileobj.write(b'\010')                 # compression method
 148         try:
 149             # RFC 1952 requires the FNAME field to be Latin-1. Do not
 150             # include filenames that cannot be represented that way.
 151             fname = self.name.encode('latin-1')
 152             if fname.endswith(b'.gz'):
 153                 fname = fname[:-3]
 154         except UnicodeEncodeError:
 155             fname = b''
 156         flags = 0
 157         if fname:
 158             flags = FNAME
 159         self.fileobj.write(chr(flags).encode('latin-1'))
 160         write32u(self.fileobj, int(time.time()))
 161         self.fileobj.write(b'\002')
 162         self.fileobj.write(b'\377')
 163         if fname:
 164             self.fileobj.write(fname + b'\000')
 165
 166     def _init_read(self):
 167         self.crc = zlib.crc32("") & 0xffffffff
 168         self.size = 0
 169
 170     def _read_gzip_header(self):
 171         magic = self.fileobj.read(2)
 172         if magic != b'\037\213':
 173             raise IOError('Not a gzipped file')
 174         method = ord( self.fileobj.read(1) )
 175         if method != 8:
 176             raise IOError('Unknown compression method')
 177         flag = ord( self.fileobj.read(1) )
 178         # modtime = self.fileobj.read(4)
 179         # extraflag = self.fileobj.read(1)
 180         # os = self.fileobj.read(1)
 181         self.fileobj.read(6)
 182
 183         if flag & FEXTRA:
 184             # Read & discard the extra field, if present
 185             xlen = ord(self.fileobj.read(1))
 186             xlen = xlen + 256*ord(self.fileobj.read(1))
 187             self.fileobj.read(xlen)
 188         if flag & FNAME:
 189             # Read and discard a null-terminated string containing the filename
 190             while True:
 191                 s = self.fileobj.read(1)
 192                 if not s or s==b'\000':
 193                     break
 194         if flag & FCOMMENT:
 195             # Read and discard a null-terminated string containing a comment
 196             while True:
 197                 s = self.fileobj.read(1)
 198                 if not s or s==b'\000':
 199                     break
 200         if flag & FHCRC:
 201             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 202
 203
 204     def write(self,data):
 205         if self.mode != WRITE:
 206             import errno
 207             raise IOError(errno.EBADF, "write() on read-only GzipFile object")
 208
 209         if self.fileobj is None:
 210             raise ValueError("write() on closed GzipFile object")
 211         if len(data) > 0:
 212             self.size = self.size + len(data)
 213             self.crc = zlib.crc32(data, self.crc) & 0xffffffff
 214             self.fileobj.write( self.compress.compress(data) )
 215             self.offset += len(data)
 216
 217     def read(self, size=-1):
 218         if self.mode != READ:
 219             import errno
 220             raise IOError(errno.EBADF, "read() on write-only GzipFile object")
 221
 222         if self.extrasize <= 0 and self.fileobj is None:
 223             return b''
 224
 225         readsize = 1024
 226         if size < 0:        # get the whole thing
 227             try:
 228                 while True:
 229                     self._read(readsize)
 230                     readsize = min(self.max_read_chunk, readsize * 2)
 231             except EOFError:
 232                 size = self.extrasize
 233         else:               # just get some more of it
 234             try:
 235                 while size > self.extrasize:
 236                     self._read(readsize)
 237                     readsize = min(self.max_read_chunk, readsize * 2)
 238             except EOFError:
 239                 if size > self.extrasize:
 240                     size = self.extrasize
 241
 242         chunk = self.extrabuf[:size]
 243         self.extrabuf = self.extrabuf[size:]
 244         self.extrasize = self.extrasize - size
 245
 246         self.offset += size
 247         return chunk
 248
 249     def _unread(self, buf):
 250         self.extrabuf = buf + self.extrabuf
 251         self.extrasize = len(buf) + self.extrasize
 252         self.offset -= len(buf)
 253
    def _read(self, size=1024):
        """Read up to size compressed bytes and buffer what they decompress to.

        The decompressed data is appended to self.extrabuf through
        _add_read_data().  When a new member starts, its header is parsed
        first; when a member ends, its CRC and size trailer is verified.

        Raises EOFError when the object is closed or no more data is
        available.  Requires the underlying file object to support tell()
        and seek().
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek( pos ) # Return to original position

            # Reset CRC/size, skip the member header and start a fresh
            # decompressor (negative wbits: no zlib header is expected,
            # because the gzip header was consumed above).
            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # NOTE(review): the trailer is checked before the flushed
            # data is added to the CRC and size -- confirm flush() can
            # never return data here.
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True
 303
 304     def _add_read_data(self, data):
 305         self.crc = zlib.crc32(data, self.crc) & 0xffffffff
 306         self.extrabuf = self.extrabuf + data
 307         self.extrasize = self.extrasize + len(data)
 308         self.size = self.size + len(data)
 309
 310     def _read_eof(self):
 311         # We've read to the end of the file, so we have to rewind in order
 312         # to reread the 8 bytes containing the CRC and the file size.
 313         # We check the that the computed CRC and size of the
 314         # uncompressed data matches the stored values.  Note that the size
 315         # stored is the true file size mod 2**32.
 316         self.fileobj.seek(-8, 1)
 317         crc32 = read32(self.fileobj)
 318         isize = read32(self.fileobj)  # may exceed 2GB
 319         if crc32 != self.crc:
 320             raise IOError("CRC check failed %s != %s" % (hex(crc32),
 321                                                          hex(self.crc)))
 322         elif isize != (self.size & 0xffffffff):
 323             raise IOError("Incorrect length of data produced")
 324
 325     def close(self):
 326         if self.mode == WRITE:
 327             self.fileobj.write(self.compress.flush())
 328             write32u(self.fileobj, self.crc)
 329             # self.size may exceed 2GB, or even 4GB
 330             write32u(self.fileobj, self.size & 0xffffffff)
 331             self.fileobj = None
 332         elif self.mode == READ:
 333             self.fileobj = None
 334         if self.myfileobj:
 335             self.myfileobj.close()
 336             self.myfileobj = None
 337
 338     def __del__(self):
 339         try:
 340             if (self.myfileobj is None and
 341                 self.fileobj is None):
 342                 return
 343         except AttributeError:
 344             return
 345         self.close()
 346
 347     def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
 348         if self.mode == WRITE:
 349             # Ensure the compressor's buffer is flushed
 350             self.fileobj.write(self.compress.flush(zlib_mode))
 351         self.fileobj.flush()
 352
 353     def fileno(self):
 354         """Invoke the underlying file object's fileno() method.
 355
 356         This will raise AttributeError if the underlying file object
 357         doesn't support fileno().
 358         """
 359         return self.fileobj.fileno()
 360
    def isatty(self):
        """Return False; a GzipFile never reports itself as a terminal."""
        return False
 363
    def tell(self):
        """Return the current position in the uncompressed stream.

        This is self.offset, which read() and write() advance by the
        number of uncompressed bytes they handle.
        """
        return self.offset
 366
 367     def rewind(self):
 368         '''Return the uncompressed stream file position indicator to the
 369         beginning of the file'''
 370         if self.mode != READ:
 371             raise IOError("Can't rewind in write mode")
 372         self.fileobj.seek(0)
 373         self._new_member = True
 374         self.extrabuf = b""
 375         self.extrasize = 0
 376         self.offset = 0
 377
 378     def seek(self, offset, whence=0):
 379         if whence:
 380             if whence == 1:
 381                 offset = self.offset + offset
 382             else:
 383                 raise ValueError('Seek from end not supported')
 384         if self.mode == WRITE:
 385             if offset < self.offset:
 386                 raise IOError('Negative seek in write mode')
 387             count = offset - self.offset
 388             chunk = bytes(1024)
 389             for i in range(count // 1024):
 390                 self.write(chunk)
 391             self.write(bytes(count % 1024))
 392         elif self.mode == READ:
 393             if offset < self.offset:
 394                 # for negative seek, rewind and do positive seek
 395                 self.rewind()
 396             count = offset - self.offset
 397             for i in range(count // 1024):
 398                 self.read(1024)
 399             self.read(count % 1024)
 400
 401     def readline(self, size=-1):
 402         if size < 0:
 403             size = sys.maxsize
 404             readsize = self.min_readsize
 405         else:
 406             readsize = size
 407         bufs = []
 408         while size != 0:
 409             c = self.read(readsize)
 410             i = c.find(b'\n')
 411
 412             # We set i=size to break out of the loop under two
 413             # conditions: 1) there's no newline, and the chunk is
 414             # larger than size, or 2) there is a newline, but the
 415             # resulting line would be longer than 'size'.
 416             if (size <= i) or (i == -1 and len(c) > size):
 417                 i = size - 1
 418
 419             if i >= 0 or c == b'':
 420                 bufs.append(c[:i + 1])    # Add portion of last chunk
 421                 self._unread(c[i + 1:])   # Push back rest of chunk
 422                 break
 423
 424             # Append chunk to list, decrease 'size',
 425             bufs.append(c)
 426             size = size - len(c)
 427             readsize = min(size, readsize * 2)
 428         if readsize > self.min_readsize:
 429             self.min_readsize = min(readsize, self.min_readsize * 2, 512)
 430         return b''.join(bufs) # Return resulting line
 431
 432     def readlines(self, sizehint=0):
 433         # Negative numbers result in reading all the lines
 434         if sizehint <= 0:
 435             sizehint = sys.maxsize
 436         L = []
 437         while sizehint > 0:
 438             line = self.readline()
 439             if line == b"":
 440                 break
 441             L.append(line)
 442             sizehint = sizehint - len(line)
 443
 444         return L
 445
 446     def writelines(self, L):
 447         for line in L:
 448             self.write(line)
 449
    def __iter__(self):
        """Return self; the object is its own line iterator."""
        return self
 452
 453     def __next__(self):
 454         line = self.readline()
 455         if line:
 456             return line
 457         else:
 458             raise StopIteration
 459
 460
 461 def _test():
 462     # Act like gzip; with -d, act like gunzip.
 463     # The input file is not deleted, however, nor are any other gzip
 464     # options or features supported.
 465     args = sys.argv[1:]
 466     decompress = args and args[0] == "-d"
 467     if decompress:
 468         args = args[1:]
 469     if not args:
 470         args = ["-"]
 471     for arg in args:
 472         if decompress:
 473             if arg == "-":
 474                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
 475                 g = sys.stdout
 476             else:
 477                 if arg[-3:] != ".gz":
 478                     print("filename doesn't end in .gz:", repr(arg))
 479                     continue
 480                 f = open(arg, "rb")
 481                 g = builtins.open(arg[:-3], "wb")
 482         else:
 483             if arg == "-":
 484                 f = sys.stdin
 485                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
 486             else:
 487                 f = builtins.open(arg, "rb")
 488                 g = open(arg + ".gz", "wb")
 489         while True:
 490             chunk = f.read(1024)
 491             if not chunk:
 492                 break
 493             g.write(chunk)
 494         if g is not sys.stdout:
 495             g.close()
 496         if f is not sys.stdin:
 497             f.close()
 498
# Run the command-line front end when the module is executed as a script.
if __name__ == '__main__':
    _test()