append(): Fixing the test for convertibility after consultation with
[python/dscho.git] / Lib / gzip.py
blob55d448dd1d7905c29f485eed62025612cf7b3afd
1 """Functions that read and write gzipped files.
3 The user of the file doesn't have to worry about the compression,
4 but random access is not allowed."""
6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
8 import struct, sys, time
9 import zlib
10 import __builtin__
12 __all__ = ["GzipFile","open"]
# Bit masks for the flag byte of a gzip member header (see RFC 1952).
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record how the object was opened.
READ = 1
WRITE = 2
def write32(output, value):
    """Write value to output as a signed 32-bit little-endian integer.

    output is any object with a write() method; value must fit in a
    signed 32-bit integer, otherwise struct.pack raises struct.error.
    """
    packed = struct.pack("<l", value)
    output.write(packed)
def write32u(output, value):
    """Write value to output as an unsigned 32-bit little-endian integer.

    A negative value is first shifted up by 2**32, so a signed 32-bit
    quantity is written with the same bit pattern it has in two's
    complement.
    """
    unsigned = value
    if unsigned < 0:
        unsigned += 0x100000000
    output.write(struct.pack("<L", unsigned))
def read32(input):
    """Read four bytes from input and return them as a signed 32-bit
    little-endian integer.

    input is any object with a read() method.
    """
    raw = input.read(4)
    (value,) = struct.unpack("<l", raw)
    return value
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip-compressed file and return a GzipFile object.

    This is a convenience wrapper around GzipFile(filename, mode,
    compresslevel).  filename is required; mode defaults to 'rb' and
    compresslevel defaults to 9.

    """
    gzfile = GzipFile(filename=filename, mode=mode,
                      compresslevel=compresslevel)
    return gzfile
class GzipFile:
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    An instance is opened either for reading (mode READ) or for writing
    (mode WRITE); tell() reports the position in the uncompressed stream.

    """

    # File object opened by the constructor itself (as opposed to one passed
    # in by the caller).  close() closes it; it stays None when the caller
    # supplied fileobj.
    myfileobj = None

    # Upper bound for the size of a single compressed-data read issued by
    # read(); the request size doubles on every iteration up to this limit.
    max_read_chunk = 10 * 1024 * 1024

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        Raises IOError if mode does not start with 'r', 'w' or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        kind = mode[:1]
        if kind == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            self.extrabuf = ""
            self.extrasize = 0
            self.filename = filename

        elif kind == 'w' or kind == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative window bits: produce a raw deflate stream; the gzip
            # header and trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Current position in the uncompressed stream, as reported by tell()
        self.offset = 0

        if self.mode == WRITE:
            self._write_gzip_header()

    def __repr__(self):
        """Return '<gzip ...>' built from the repr of the underlying file."""
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the write-side state; filename gets '.gz' appended if it
        does not already end with it."""
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the header of a new gzip member to the underlying file."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        # The stored name is the file name without its '.gz' suffix
        fname = self.filename[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, int(time.time()))   # modification time
        self.fileobj.write('\002')                 # extra flags
        self.fileobj.write('\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32("")
        self.size = 0

    def _read_gzip_header(self):
        """Consume the header of a gzip member from the underlying file.

        Raises IOError if the magic bytes or the compression method are not
        the expected ones.  Optional header fields are read and discarded.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord(self.fileobj.read(1))
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(self.fileobj.read(1))
        # Skip modtime (4 bytes), extraflag (1 byte) and os (1 byte)
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256 * ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress data and write it to the underlying file.

        Raises IOError if the object was not opened for writing and
        ValueError if it has already been closed.
        """
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write(self.compress.compress(data))
            self.offset += len(data)

    def read(self, size=-1):
        """Return up to size bytes of uncompressed data.

        A negative size (the default) reads everything up to end of file.
        Returns '' once all data has been consumed.  Raises IOError if the
        object was not opened for reading.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back in front of the buffered uncompressed data."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read up to size compressed bytes and buffer their uncompressed
        form.

        Raises EOFError when there is nothing more to read.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek(-len(self.decompress.unused_data) + 8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append uncompressed data to the buffer and update CRC and size."""
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and size stored in the member trailer.

        Raises ValueError if either stored value does not match the one
        computed while decompressing.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)
        # Both trailer fields are read as signed 32-bit values, so compare
        # them modulo 2**32 with the values computed here.
        if crc32 % 0x100000000 != self.crc % 0x100000000:
            raise ValueError("CRC check failed")
        elif isize % 0x100000000 != self.size % 0x100000000:
            raise ValueError("Incorrect length of data produced")

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size trailer
        are written first.  The underlying file is closed only if it was
        opened by the constructor.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # The size field holds only the low 32 bits of the length
                write32u(fileobj, self.size % 0x100000000)
        finally:
            if self.myfileobj:
                self.myfileobj.close()
                self.myfileobj = None

    def __del__(self):
        """Close the object on deletion unless it is already closed or was
        never fully constructed."""
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            return
        self.close()

    def flush(self):
        """Flush the underlying file object."""
        self.fileobj.flush()

    def isatty(self):
        """Always return False."""
        return False

    def tell(self):
        """Return the current position in the uncompressed stream."""
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset):
        """Move to the absolute position offset in the uncompressed stream.

        In write mode only forward seeks are possible (IOError otherwise);
        the gap is filled with zero bytes.  In read mode a backward seek
        rewinds and reads forward again up to offset.
        """
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

    def readline(self, size=-1):
        """Read and return one line of at most size bytes.

        A negative size (the default) means no limit.  The returned line
        keeps its trailing newline if one was read.
        """
        if size < 0:
            size = sys.maxint
        bufs = []
        readsize = min(100, size)    # Read from the file in small chunks
        while True:
            if size == 0:
                return "".join(bufs) # Return resulting line

            c = self.read(readsize)
            i = c.find('\n')
            # We set i=size-1 to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if i == -1 and len(c) > size:
                i = size - 1
            elif size <= i:
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i+1])    # Add portion of last chunk
                self._unread(c[i+1:])   # Push back rest of chunk
                return ''.join(bufs)    # Return resulting line

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)

    def readlines(self, sizehint=0):
        """Return a list of lines, stopping at end of file or once the
        lines read total at least sizehint bytes."""
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxint
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == "":
                break
            L.append(line)
            sizehint = sizehint - len(line)

        return L

    def writelines(self, L):
        """Write every item of the iterable L with write()."""
        for line in L:
            self.write(line)

    def __iter__(self):
        """Return self; iteration yields lines via next()."""
        return self

    def next(self):
        """Return the next line, raising StopIteration at end of file."""
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration
def _test():
    """Command-line driver: compress the named files, or decompress them
    when the first argument is -d.

    Acts like gzip (or gunzip with -d).  The input file is not deleted,
    however, nor are any other gzip options or features supported.  With
    no file arguments, or with "-", data flows from stdin to stdout.
    """
    names = sys.argv[1:]
    decompress = names and names[0] == "-d"
    if decompress:
        names = names[1:]
    if not names:
        names = ["-"]
    for name in names:
        use_std_streams = (name == "-")
        if decompress:
            if use_std_streams:
                source = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                sink = sys.stdout
            elif name[-3:] != ".gz":
                print(" ".join(["filename doesn't end in .gz:", repr(name)]))
                continue
            else:
                source = open(name, "rb")
                sink = __builtin__.open(name[:-3], "wb")
        elif use_std_streams:
            source = sys.stdin
            sink = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
        else:
            source = __builtin__.open(name, "rb")
            sink = open(name + ".gz", "wb")
        # Copy in 1 KiB pieces until the source is exhausted
        chunk = source.read(1024)
        while chunk:
            sink.write(chunk)
            chunk = source.read(1024)
        # The standard streams are left open for the interpreter
        if sink is not sys.stdout:
            sink.close()
        if source is not sys.stdin:
            source.close()


if __name__ == '__main__':
    _test()