Lib/gzip.py

   1 """Functions that read and write gzipped files.
   2
   3 The user of the file doesn't have to worry about the compression,
   4 but random access is not allowed."""
   5
   6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
   7
   8 import struct, sys, time
   9 import zlib
  10 import __builtin__
  11
# Public names exported by a star-import of this module.
__all__ = ["GzipFile","open"]
  13
  14 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
  15
  16 READ, WRITE = 1, 2
  17
  18 def write32(output, value):
  19     output.write(struct.pack("<l", value))
  20
  21 def write32u(output, value):
  22     if value < 0:
  23         value = value + 0x100000000L
  24     output.write(struct.pack("<L", value))
  25
  26 def read32(input):
  27     return struct.unpack("<l", input.read(4))[0]
  28
  29 def open(filename, mode="rb", compresslevel=9):
  30     return GzipFile(filename, mode, compresslevel)
  31
  32 class GzipFile:
  33
  34     myfileobj = None
  35
  36     def __init__(self, filename=None, mode=None,
  37                  compresslevel=9, fileobj=None):
  38         if fileobj is None:
  39             fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
  40         if filename is None:
  41             if hasattr(fileobj, 'name'): filename = fileobj.name
  42             else: filename = ''
  43         if mode is None:
  44             if hasattr(fileobj, 'mode'): mode = fileobj.mode
  45             else: mode = 'rb'
  46
  47         if mode[0:1] == 'r':
  48             self.mode = READ
  49             # Set flag indicating start of a new member
  50             self._new_member = 1
  51             self.extrabuf = ""
  52             self.extrasize = 0
  53             self.filename = filename
  54
  55         elif mode[0:1] == 'w' or mode[0:1] == 'a':
  56             self.mode = WRITE
  57             self._init_write(filename)
  58             self.compress = zlib.compressobj(compresslevel,
  59                                              zlib.DEFLATED,
  60                                              -zlib.MAX_WBITS,
  61                                              zlib.DEF_MEM_LEVEL,
  62                                              0)
  63         else:
  64             raise ValueError, "Mode " + mode + " not supported"
  65
  66         self.fileobj = fileobj
  67         self.offset = 0
  68
  69         if self.mode == WRITE:
  70             self._write_gzip_header()
  71
  72     def __repr__(self):
  73         s = repr(self.fileobj)
  74         return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
  75
  76     def _init_write(self, filename):
  77         if filename[-3:] != '.gz':
  78             filename = filename + '.gz'
  79         self.filename = filename
  80         self.crc = zlib.crc32("")
  81         self.size = 0
  82         self.writebuf = []
  83         self.bufsize = 0
  84
  85     def _write_gzip_header(self):
  86         self.fileobj.write('\037\213')             # magic header
  87         self.fileobj.write('\010')                 # compression method
  88         fname = self.filename[:-3]
  89         flags = 0
  90         if fname:
  91             flags = FNAME
  92         self.fileobj.write(chr(flags))
  93         write32u(self.fileobj, long(time.time()))
  94         self.fileobj.write('\002')
  95         self.fileobj.write('\377')
  96         if fname:
  97             self.fileobj.write(fname + '\000')
  98
  99     def _init_read(self):
 100         self.crc = zlib.crc32("")
 101         self.size = 0
 102
 103     def _read_gzip_header(self):
 104         magic = self.fileobj.read(2)
 105         if magic != '\037\213':
 106             raise IOError, 'Not a gzipped file'
 107         method = ord( self.fileobj.read(1) )
 108         if method != 8:
 109             raise IOError, 'Unknown compression method'
 110         flag = ord( self.fileobj.read(1) )
 111         # modtime = self.fileobj.read(4)
 112         # extraflag = self.fileobj.read(1)
 113         # os = self.fileobj.read(1)
 114         self.fileobj.read(6)
 115
 116         if flag & FEXTRA:
 117             # Read & discard the extra field, if present
 118             xlen=ord(self.fileobj.read(1))
 119             xlen=xlen+256*ord(self.fileobj.read(1))
 120             self.fileobj.read(xlen)
 121         if flag & FNAME:
 122             # Read and discard a null-terminated string containing the filename
 123             while (1):
 124                 s=self.fileobj.read(1)
 125                 if not s or s=='\000': break
 126         if flag & FCOMMENT:
 127             # Read and discard a null-terminated string containing a comment
 128             while (1):
 129                 s=self.fileobj.read(1)
 130                 if not s or s=='\000': break
 131         if flag & FHCRC:
 132             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 133
 134
 135     def write(self,data):
 136         if self.fileobj is None:
 137             raise ValueError, "write() on closed GzipFile object"
 138         if len(data) > 0:
 139             self.size = self.size + len(data)
 140             self.crc = zlib.crc32(data, self.crc)
 141             self.fileobj.write( self.compress.compress(data) )
 142             self.offset += len(data)
 143
 144     def read(self, size=-1):
 145         if self.extrasize <= 0 and self.fileobj is None:
 146             return ''
 147
 148         readsize = 1024
 149         if size < 0:        # get the whole thing
 150             try:
 151                 while 1:
 152                     self._read(readsize)
 153                     readsize = readsize * 2
 154             except EOFError:
 155                 size = self.extrasize
 156         else:               # just get some more of it
 157             try:
 158                 while size > self.extrasize:
 159                     self._read(readsize)
 160                     readsize = readsize * 2
 161             except EOFError:
 162                 if size > self.extrasize:
 163                     size = self.extrasize
 164
 165         chunk = self.extrabuf[:size]
 166         self.extrabuf = self.extrabuf[size:]
 167         self.extrasize = self.extrasize - size
 168
 169         self.offset += size
 170         return chunk
 171
 172     def _unread(self, buf):
 173         self.extrabuf = buf + self.extrabuf
 174         self.extrasize = len(buf) + self.extrasize
 175         self.offset -= len(buf)
 176
 177     def _read(self, size=1024):
 178         if self.fileobj is None: raise EOFError, "Reached EOF"
 179
 180         if self._new_member:
 181             # If the _new_member flag is set, we have to
 182             # jump to the next member, if there is one.
 183             #
 184             # First, check if we're at the end of the file;
 185             # if so, it's time to stop; no more members to read.
 186             pos = self.fileobj.tell()   # Save current position
 187             self.fileobj.seek(0, 2)     # Seek to end of file
 188             if pos == self.fileobj.tell():
 189                 raise EOFError, "Reached EOF"
 190             else:
 191                 self.fileobj.seek( pos ) # Return to original position
 192
 193             self._init_read()
 194             self._read_gzip_header()
 195             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
 196             self._new_member = 0
 197
 198         # Read a chunk of data from the file
 199         buf = self.fileobj.read(size)
 200
 201         # If the EOF has been reached, flush the decompression object
 202         # and mark this object as finished.
 203
 204         if buf == "":
 205             uncompress = self.decompress.flush()
 206             self._read_eof()
 207             self._add_read_data( uncompress )
 208             raise EOFError, 'Reached EOF'
 209
 210         uncompress = self.decompress.decompress(buf)
 211         self._add_read_data( uncompress )
 212
 213         if self.decompress.unused_data != "":
 214             # Ending case: we've come to the end of a member in the file,
 215             # so seek back to the start of the unused data, finish up
 216             # this member, and read a new gzip header.
 217             # (The number of bytes to seek back is the length of the unused
 218             # data, minus 8 because _read_eof() will rewind a further 8 bytes)
 219             self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
 220
 221             # Check the CRC and file size, and set the flag so we read
 222             # a new member on the next call
 223             self._read_eof()
 224             self._new_member = 1
 225
 226     def _add_read_data(self, data):
 227         self.crc = zlib.crc32(data, self.crc)
 228         self.extrabuf = self.extrabuf + data
 229         self.extrasize = self.extrasize + len(data)
 230         self.size = self.size + len(data)
 231
 232     def _read_eof(self):
 233         # We've read to the end of the file, so we have to rewind in order
 234         # to reread the 8 bytes containing the CRC and the file size.
 235         # We check the that the computed CRC and size of the
 236         # uncompressed data matches the stored values.
 237         self.fileobj.seek(-8, 1)
 238         crc32 = read32(self.fileobj)
 239         isize = read32(self.fileobj)
 240         if crc32%0x100000000L != self.crc%0x100000000L:
 241             raise ValueError, "CRC check failed"
 242         elif isize != self.size:
 243             raise ValueError, "Incorrect length of data produced"
 244
 245     def close(self):
 246         if self.mode == WRITE:
 247             self.fileobj.write(self.compress.flush())
 248             write32(self.fileobj, self.crc)
 249             write32(self.fileobj, self.size)
 250             self.fileobj = None
 251         elif self.mode == READ:
 252             self.fileobj = None
 253         if self.myfileobj:
 254             self.myfileobj.close()
 255             self.myfileobj = None
 256
 257     def __del__(self):
 258         try:
 259             if (self.myfileobj is None and
 260                 self.fileobj is None):
 261                 return
 262         except AttributeError:
 263             return
 264         self.close()
 265
 266     def flush(self):
 267         self.fileobj.flush()
 268
 269     def isatty(self):
 270         return 0
 271
 272     def tell(self):
 273         return self.offset
 274
 275     def rewind(self):
 276         '''Return the uncompressed stream file position indicator to the
 277         beginning of the file'''
 278         if self.mode != READ:
 279             raise IOError("Can't rewind in write mode")
 280         self.fileobj.seek(0)
 281         self._new_member = 1
 282         self.extrabuf = ""
 283         self.extrasize = 0
 284         self.offset = 0
 285
 286     def seek(self, offset):
 287         if self.mode == WRITE:
 288             if offset < self.offset:
 289                 raise IOError('Negative seek in write mode')
 290             count = offset - self.offset
 291             for i in range(count/1024):
 292                 f.write(1024*'\0')
 293             self.write((count%1024)*'\0')
 294         elif self.mode == READ:
 295             if offset < self.offset:
 296                 # for negative seek, rewind and do positive seek
 297                 self.rewind()
 298             count = offset - self.offset
 299             for i in range(count/1024): self.read(1024)
 300             self.read(count % 1024)
 301
 302     def readline(self, size=-1):
 303         if size < 0: size = sys.maxint
 304         bufs = []
 305         readsize = min(100, size)    # Read from the file in small chunks
 306         while 1:
 307             if size == 0:
 308                 return "".join(bufs) # Return resulting line
 309
 310             c = self.read(readsize)
 311             i = c.find('\n')
 312             if size is not None:
 313                 # We set i=size to break out of the loop under two
 314                 # conditions: 1) there's no newline, and the chunk is
 315                 # larger than size, or 2) there is a newline, but the
 316                 # resulting line would be longer than 'size'.
 317                 if i==-1 and len(c) > size: i=size-1
 318                 elif size <= i: i = size -1
 319
 320             if i >= 0 or c == '':
 321                 bufs.append(c[:i+1])    # Add portion of last chunk
 322                 self._unread(c[i+1:])   # Push back rest of chunk
 323                 return ''.join(bufs)    # Return resulting line
 324
 325             # Append chunk to list, decrease 'size',
 326             bufs.append(c)
 327             size = size - len(c)
 328             readsize = min(size, readsize * 2)
 329
 330     def readlines(self, sizehint=0):
 331         # Negative numbers result in reading all the lines
 332         if sizehint <= 0: sizehint = sys.maxint
 333         L = []
 334         while sizehint > 0:
 335             line = self.readline()
 336             if line == "": break
 337             L.append( line )
 338             sizehint = sizehint - len(line)
 339
 340         return L
 341
 342     def writelines(self, L):
 343         for line in L:
 344             self.write(line)
 345
 346
 347 def _test():
 348     # Act like gzip; with -d, act like gunzip.
 349     # The input file is not deleted, however, nor are any other gzip
 350     # options or features supported.
 351     args = sys.argv[1:]
 352     decompress = args and args[0] == "-d"
 353     if decompress:
 354         args = args[1:]
 355     if not args:
 356         args = ["-"]
 357     for arg in args:
 358         if decompress:
 359             if arg == "-":
 360                 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
 361                 g = sys.stdout
 362             else:
 363                 if arg[-3:] != ".gz":
 364                     print "filename doesn't end in .gz:", `arg`
 365                     continue
 366                 f = open(arg, "rb")
 367                 g = __builtin__.open(arg[:-3], "wb")
 368         else:
 369             if arg == "-":
 370                 f = sys.stdin
 371                 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
 372             else:
 373                 f = __builtin__.open(arg, "rb")
 374                 g = open(arg + ".gz", "wb")
 375         while 1:
 376             chunk = f.read(1024)
 377             if not chunk:
 378                 break
 379             g.write(chunk)
 380         if g is not sys.stdout:
 381             g.close()
 382         if f is not sys.stdin:
 383             f.close()
 384
 385 if __name__ == '__main__':
 386     _test()