Fix the tag.
[python/dscho.git] / Lib / gzip.py
blob45fae9facf869ee62a04e96e410bd88e0780cafe
"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import builtins
import struct
import sys
import time
import zlib

# Public API of this module.
__all__ = ["GzipFile", "open"]

# Bit masks for the flag byte of a gzip member header; _read_gzip_header()
# tests FEXTRA, FNAME, FCOMMENT and FHCRC to know which optional fields to skip.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode.
READ, WRITE = 1, 2

def U32(i):
    """Return i as an unsigned integer, assuming it fits in 32 bits.

    A negative i is treated as the signed view of a 32-bit value and is
    shifted up by 2**32; a non-negative i is returned unchanged.
    """
    if i < 0:
        i += 1 << 32
    return i

def LOWU32(i):
    """Return the low-order 32 bits of i, as a non-negative int."""
    return i & 0xFFFFFFFF

def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer.

    Only the low-order 32 bits of value are written, so a negative value is
    emitted as its 32-bit two's-complement bit pattern.

    output -- object with a write() method accepting bytes
    value  -- int to serialise
    """
    # struct's "<L" format raises struct.error for negative or over-wide
    # ints, so reduce the value to 32 bits first; that way the bit pattern
    # is written correctly whether the caller thinks of it as signed or
    # unsigned.
    output.write(struct.pack("<L", value & 0xFFFFFFFF))

def read32(input):
    """Read 4 bytes from input and return them as an unsigned int.

    The bytes are interpreted in little-endian order.  struct.error is
    raised if input.read(4) does not return exactly 4 bytes.
    """
    return struct.unpack("<I", input.read(4))[0]

def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)

class GzipFile:
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by the constructor itself (and therefore closed by
    # close()); stays None when the caller supplies fileobj.
    myfileobj = None
    # Largest raw read that read() will request from _read() in one call.
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        Raises IOError if mode does not start with 'r', 'w' or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            self.extrabuf = b""
            self.extrasize = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits produces a raw deflate stream; the gzip header
            # and trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leak a file that was opened just above.
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Position in the uncompressed stream, as reported by tell().
        self.offset = 0

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias for name.

        In write mode a ".gz" suffix is appended when name lacks one.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the per-file state used while writing."""
        self.name = filename
        # zlib.crc32() only accepts bytes-like data, never str.
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        # RFC 1952 requires the FNAME field to be Latin-1.  Do not
        # include filenames that cannot be represented that way.  A name
        # that is neither str nor bytes (e.g. the integer "name" of a file
        # object wrapping a descriptor) is left out as well.
        name = self.name
        if isinstance(name, str):
            try:
                fname = name.encode('latin-1')
            except UnicodeEncodeError:
                fname = b''
        elif isinstance(name, bytes):
            fname = name
        else:
            fname = b''
        if fname.endswith(b'.gz'):
            fname = fname[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(bytes([flags]))
        write32u(self.fileobj, int(time.time()))    # modification time
        self.fileobj.write(b'\002')                 # extra flags byte
        self.fileobj.write(b'\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        # zlib.crc32() only accepts bytes-like data, never str.
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        """Consume one gzip member header from the underlying file object.

        Raises IOError if the magic number or the compression method is not
        the expected one.  Optional header fields are read and discarded.
        """
        magic = self.fileobj.read(2)
        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        # modtime = self.fileobj.read(4)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        """Compress data and write it to the underlying file object.

        Raises IOError when the object was opened for reading and
        ValueError when it has already been closed.
        """
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

    def read(self, size=-1):
        """Return up to size uncompressed bytes; a negative size reads to EOF.

        Raises IOError when the object was opened for writing.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back onto the front of the read buffer."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read up to size compressed bytes and buffer their decompressed form.

        Raises EOFError once no more data is available.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the read buffer, updating CRC and size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and size stored in the 8-byte member trailer.

        Raises IOError when either value disagrees with what was computed
        while decompressing.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

    def close(self):
        """Finish the gzip stream (in write mode) and release the file objects.

        Calling close() more than once is allowed.  A file object supplied
        by the caller is not closed; one opened by the constructor is, even
        if writing the trailer fails.
        """
        fileobj, self.fileobj = self.fileobj, None
        try:
            if fileobj is not None and self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj, self.myfileobj = self.myfileobj, None
            if myfileobj:
                myfileobj.close()

    def __del__(self):
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            # __init__ did not get far enough to set fileobj.
            return
        self.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data and the underlying file object."""
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def isatty(self):
        return False

    def tell(self):
        """Return the current position in the uncompressed stream."""
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset, whence=0):
        """Move to offset in the uncompressed stream and return the position.

        whence may be 0 (absolute) or 1 (relative to the current position);
        any other value raises ValueError.  In write mode only forward seeks
        are possible (the gap is filled with zero bytes) and a backward seek
        raises IOError.  In read mode a backward seek rewinds and re-reads
        from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            chunk = bytes(1024)
            for i in range(count // 1024):
                self.write(chunk)
            self.write(bytes(count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)
        return self.offset

    def readline(self, size=-1):
        """Return one line, reading at most size bytes when size >= 0.

        The trailing newline is kept; b'' is returned at end of file.
        """
        if size < 0:
            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs) # Return resulting line

    def readlines(self, sizehint=0):
        """Return a list of lines, stopping once about sizehint bytes are read.

        A sizehint of zero or less reads every remaining line.
        """
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxsize
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == b"":
                break
            L.append(line)
            sizehint = sizehint - len(line)

        return L

    def writelines(self, L):
        """Write every item of the iterable L with write()."""
        for line in L:
            self.write(line)

    def __iter__(self):
        return self

    def __next__(self):
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration

def _test():
    """Command-line driver: compress the named files, or decompress with -d.

    Act like gzip; with -d, act like gunzip.
    The input file is not deleted, however, nor are any other gzip
    options or features supported.  The argument "-" (also the default
    when no file is named) selects standard input and standard output.
    """
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    # gzip data is binary, so use the byte-oriented layer underneath the
    # text-mode standard streams when there is one.
    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    stdout = getattr(sys.stdout, "buffer", sys.stdout)
    for arg in args:
        if decompress:
            if arg == "-":
                # NOTE(review): GzipFile._read() calls tell()/seek() on its
                # file object, so this presumably needs a seekable stdin.
                f = GzipFile(filename="", mode="rb", fileobj=stdin)
                g = stdout
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = stdin
                g = GzipFile(filename="", mode="wb", fileobj=stdout)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Close everything this loop iteration opened, even on error;
            # the process-wide standard streams are left open.
            if g is not stdout:
                g.close()
            if f is not stdin:
                f.close()

# Run the gzip/gunzip-like driver when this file is executed as a script.
if __name__ == '__main__':
    _test()