Update version number and release date.
[python/dscho.git] / Lib / gzip.py
blob761d94157ba6c2ec4563fd67c1196ee8a8765544
1 """Functions that read and write gzipped files.
3 The user of the file doesn't have to worry about the compression,
4 but random access is not allowed."""
6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
8 import struct, sys, time
9 import zlib
10 import __builtin__
# Names exported by "from gzip import *": the file class and the
# open() convenience wrapper defined below.
__all__ = ["GzipFile","open"]
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1       # not consulted by the code in this module
FHCRC = 2       # a 16-bit header CRC follows the optional fields
FEXTRA = 4      # a length-prefixed "extra" field is present
FNAME = 8       # a null-terminated original file name is present
FCOMMENT = 16   # a null-terminated comment is present

# Values stored in GzipFile.mode to record how the object was opened.
READ = 1
WRITE = 2
def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.

    A negative i is taken to be the signed reading of a 32-bit pattern and
    is mapped to the matching unsigned value by adding 2**32; a
    non-negative i is returned unchanged.  Results of 2GB or more may be
    a long on builds whose plain int is 32 bits wide.
    """
    if i < 0:
        # 2 ** 32 promotes to a long by itself where needed, so no
        # version-specific long-literal suffix is required.
        i += 2 ** 32
    return i
def LOWU32(i):
    """Return the low-order 32 bits of an int, as a non-negative int.

    Works for negative values and for values wider than 32 bits; the
    result is always in the range 0 .. 2**32 - 1.
    """
    # Mask spelled arithmetically so it needs no long-literal suffix and
    # is a non-negative value on every interpreter version.
    return i & (2 ** 32 - 1)
def write32(output, value):
    """Write value to output as a little-endian signed 32-bit integer."""
    packed = struct.pack("<l", value)
    output.write(packed)
def write32u(output, value):
    """Write value to output as a little-endian unsigned 32-bit integer."""
    # Original author's note: the L format writes the bit pattern
    # correctly whether signed or unsigned.
    packed = struct.pack("<L", value)
    output.write(packed)
def read32(input):
    """Read four bytes from input and return them as a signed integer.

    The bytes are interpreted as a little-endian signed 32-bit value.
    """
    raw = input.read(4)
    (value,) = struct.unpack("<l", raw)
    return value
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip file; shorthand for GzipFile(filename, mode, compresslevel).

    filename is required.  mode defaults to 'rb' and compresslevel
    defaults to 9.  Returns the new GzipFile instance.
    """
    gzfile = GzipFile(filename, mode, compresslevel)
    return gzfile
class GzipFile:
    """File-like object that reads or writes gzip-compressed data.

    The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.  An instance
    is in either read mode or write mode for its whole lifetime.
    """

    # File object opened by the constructor itself (and therefore closed
    # again by close()).  Stays None when the caller supplied fileobj.
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        Raises IOError if the mode does not start with 'r', 'w' or 'a'.
        """
        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Decompressed data not yet handed to the caller, and its length
            self.extrabuf = ""
            self.extrasize = 0
            self.filename = filename

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits asks zlib for a raw deflate stream; the gzip
            # header and trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)  # default strategy
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Position in the uncompressed stream, as reported by tell()
        self.offset = 0

        if self.mode == WRITE:
            self._write_gzip_header()

    def __repr__(self):
        """Return '<gzip ...>' built from the repr of the underlying file."""
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the write-side state (name, CRC, size, buffers)."""
        # The stored name always ends in '.gz'; _write_gzip_header()
        # strips those three characters again to get the header name.
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = self.filename[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, long(time.time()))  # modification time
        self.fileobj.write('\002')                 # extra flags byte
        self.fileobj.write('\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a member."""
        self.crc = zlib.crc32("")
        self.size = 0

    def _read_gzip_header(self):
        """Consume one gzip member header from the underlying file.

        Raises IOError if the magic number or compression method is not
        the expected one.  Optional header fields are read and discarded.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord(self.fileobj.read(1))
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(self.fileobj.read(1))
        # Skip modtime (4 bytes), extraflag (1 byte) and os (1 byte)
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress data and write it to the underlying file.

        Raises IOError(EBADF) if the object was opened for reading and
        ValueError if it has been closed.
        """
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write(self.compress.compress(data))
            self.offset += len(data)

    def read(self, size=-1):
        """Return up to size bytes of uncompressed data.

        A negative size reads everything up to end of file.  Returns ''
        once the object is closed and its buffer is empty.  Raises
        IOError(EBADF) if the object was opened for writing.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = readsize * 2
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = readsize * 2
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back onto the front of the read buffer."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read and decompress up to size compressed bytes into the buffer.

        Raises EOFError when there is nothing more to read.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek(-len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the buffer; update CRC and size."""
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and size stored in the member trailer.

        Raises IOError if either value disagrees with what was computed
        while decompressing.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = U32(read32(self.fileobj))   # may exceed 2GB
        if U32(crc32) != U32(self.crc):
            raise IOError("CRC check failed")
        elif isize != LOWU32(self.size):
            raise IOError("Incorrect length of data produced")

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  The underlying file is closed only if
        the constructor opened it.  Calling close() again is a no-op.
        """
        if self.fileobj is not None:
            if self.mode == WRITE:
                self.fileobj.write(self.compress.flush())
                # Write both trailer words as unsigned bit patterns so a
                # CRC of 2**31 or more cannot overflow a signed pack.
                write32u(self.fileobj, LOWU32(self.crc))
                # self.size may exceed 2GB, or even 4GB
                write32u(self.fileobj, LOWU32(self.size))
                self.fileobj = None
            elif self.mode == READ:
                self.fileobj = None
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None

    def __del__(self):
        """Close the object on garbage collection if still open."""
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            # __init__ did not get far enough to set fileobj
            return
        self.close()

    def flush(self):
        """Flush the underlying file object."""
        self.fileobj.flush()

    def isatty(self):
        """Return False; a GzipFile is never a terminal."""
        return False

    def tell(self):
        """Return the current position in the uncompressed stream."""
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset):
        """Move to absolute position offset in the uncompressed stream.

        In write mode only forward seeks are possible; the gap is filled
        with zero bytes, and IOError is raised for a backward seek.  In
        read mode a backward seek rewinds and re-reads from the start.
        """
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

    def readline(self, size=-1):
        """Return the next line, reading at most size bytes.

        A negative size means no limit.  Returns '' at end of file.
        """
        if size < 0:
            size = sys.maxint
        bufs = []
        readsize = min(100, size)    # Read from the file in small chunks
        while True:
            if size == 0:
                return "".join(bufs) # Return resulting line

            c = self.read(readsize)
            i = c.find('\n')
            # We set i=size-1 to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if i == -1 and len(c) > size:
                i = size - 1
            elif size <= i:
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i+1])    # Add portion of last chunk
                self._unread(c[i+1:])   # Push back rest of chunk
                return ''.join(bufs)    # Return resulting line

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)

    def readlines(self, sizehint=0):
        """Return a list of lines, stopping once sizehint bytes are read.

        A sizehint of zero or less reads all remaining lines.
        """
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxint
        lines = []
        while sizehint > 0:
            line = self.readline()
            if line == "":
                break
            lines.append(line)
            sizehint = sizehint - len(line)

        return lines

    def writelines(self, L):
        """Write every string in the iterable L."""
        for line in L:
            self.write(line)

    def __iter__(self):
        """Return self; iteration yields lines via next()."""
        return self

    def next(self):
        """Return the next line, or raise StopIteration at end of file."""
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration
def _test():
    """Command-line driver: act like gzip; with -d, act like gunzip.

    The input file is not deleted, however, nor are any other gzip
    options or features supported.  Each argument names a file to
    process; "-" (also the default when no file is given) means read
    standard input and write standard output.
    """
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    # %r gives the same text the old backtick repr did
                    sys.stdout.write(
                        "filename doesn't end in .gz: %r\n" % (arg,))
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Release the files even if the copy fails part-way, but
            # never close the process's own standard streams.
            if g is not sys.stdout:
                g.close()
            if f is not sys.stdin:
                f.close()
# Run the command-line driver only when this file is executed as a script.
if __name__ == '__main__':
    _test()