1 """Functions that read and write gzipped files.
3 The user of the file doesn't have to worry about the compression,
4 but random access is not allowed."""
6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
8 import struct
, sys
, time
# Public names of this module: the list consulted by "from <module> import *".
__all__ = ["GzipFile","open"]
# Single-bit mask values (each one a distinct power of two).
# NOTE(review): presumably these select bits of the gzip header flag byte
# (the names match the FLG bits in RFC 1952) -- confirm against the code
# that reads and writes the header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16
19 """Return i as an unsigned integer, assuming it fits in 32 bits.
21 If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
28 """Return the low-order 32 bits of an int, as a non-negative int."""
29 return i
& 0xFFFFFFFFL
def write32(output, value):
    """Write value to output as a signed 32-bit little-endian integer.

    output is any object with a write() method; value must fit the
    struct "<l" format.
    """
    packed = struct.pack("<l", value)
    output.write(packed)
def write32u(output, value):
    """Write value to output as an unsigned 32-bit little-endian integer.

    output is any object with a write() method.
    """
    # The "L" format code is used here so that the 32-bit pattern is
    # emitted as-is.
    # NOTE(review): the original remark relied on "L" accepting signed
    # values as well -- verify on the target interpreter.
    packed = struct.pack("<L", value)
    output.write(packed)
40 return struct
.unpack("<l", input.read(4))[0]
def open(filename, mode="rb", compresslevel=9):
    """Convenience wrapper: build and return GzipFile(filename, mode, compresslevel).

    Only filename has to be supplied.  When omitted, mode is 'rb' and
    compresslevel is 9.
    """
    return GzipFile(filename=filename, mode=mode, compresslevel=compresslevel)
52 """The GzipFile class simulates most of the methods of a file object with
53 the exception of the readinto() and truncate() methods.
59 def __init__(self
, filename
=None, mode
=None,
60 compresslevel
=9, fileobj
=None):
61 """Constructor for the GzipFile class.
63 At least one of fileobj and filename must be given a
66 The new class instance is based on fileobj, which can be a regular
67 file, a StringIO object, or any other object which simulates a file.
68 It defaults to None, in which case filename is opened to provide
71 When fileobj is not None, the filename argument is only used to be
72 included in the gzip file header, which may include the original
73 filename of the uncompressed file. It defaults to the filename of
74 fileobj, if discernible; otherwise, it defaults to the empty string,
75 and in this case the original filename is not included in the header.
77 The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
78 depending on whether the file will be read or written. The default
79 is the mode of fileobj if discernible; otherwise, the default is 'rb'.
80 Be aware that only the 'rb', 'ab', and 'wb' values should be used
81 for cross-platform portability.
83 The compresslevel argument is an integer from 1 to 9 controlling the
84 level of compression; 1 is fastest and produces the least compression,
85 and 9 is slowest and produces the most compression. The default is 9.
89 # guarantee the file is opened in binary mode on platforms
90 # that care about that sort of thing
91 if mode
and 'b' not in mode
:
94 fileobj
= self
.myfileobj
= __builtin__
.open(filename
, mode
or 'rb')
96 if hasattr(fileobj
, 'name'): filename
= fileobj
.name
99 if hasattr(fileobj
, 'mode'): mode
= fileobj
.mode
104 # Set flag indicating start of a new member
105 self
._new
_member
= True
108 self
.filename
= filename
110 elif mode
[0:1] == 'w' or mode
[0:1] == 'a':
112 self
._init
_write
(filename
)
113 self
.compress
= zlib
.compressobj(compresslevel
,
119 raise IOError, "Mode " + mode
+ " not supported"
121 self
.fileobj
= fileobj
124 if self
.mode
== WRITE
:
125 self
._write
_gzip
_header
()
128 s
= repr(self
.fileobj
)
129 return '<gzip ' + s
[1:-1] + ' ' + hex(id(self
)) + '>'
131 def _init_write(self
, filename
):
132 if filename
[-3:] != '.gz':
133 filename
= filename
+ '.gz'
134 self
.filename
= filename
135 self
.crc
= zlib
.crc32("")
140 def _write_gzip_header(self
):
141 self
.fileobj
.write('\037\213') # magic header
142 self
.fileobj
.write('\010') # compression method
143 fname
= self
.filename
[:-3]
147 self
.fileobj
.write(chr(flags
))
148 write32u(self
.fileobj
, long(time
.time()))
149 self
.fileobj
.write('\002')
150 self
.fileobj
.write('\377')
152 self
.fileobj
.write(fname
+ '\000')
154 def _init_read(self
):
155 self
.crc
= zlib
.crc32("")
158 def _read_gzip_header(self
):
159 magic
= self
.fileobj
.read(2)
160 if magic
!= '\037\213':
161 raise IOError, 'Not a gzipped file'
162 method
= ord( self
.fileobj
.read(1) )
164 raise IOError, 'Unknown compression method'
165 flag
= ord( self
.fileobj
.read(1) )
166 # modtime = self.fileobj.read(4)
167 # extraflag = self.fileobj.read(1)
168 # os = self.fileobj.read(1)
172 # Read & discard the extra field, if present
173 xlen
= ord(self
.fileobj
.read(1))
174 xlen
= xlen
+ 256*ord(self
.fileobj
.read(1))
175 self
.fileobj
.read(xlen
)
177 # Read and discard a null-terminated string containing the filename
179 s
= self
.fileobj
.read(1)
180 if not s
or s
=='\000':
183 # Read and discard a null-terminated string containing a comment
185 s
= self
.fileobj
.read(1)
186 if not s
or s
=='\000':
189 self
.fileobj
.read(2) # Read & discard the 16-bit header CRC
192 def write(self
,data
):
193 if self
.mode
!= WRITE
:
195 raise IOError(errno
.EBADF
, "write() on read-only GzipFile object")
197 if self
.fileobj
is None:
198 raise ValueError, "write() on closed GzipFile object"
200 self
.size
= self
.size
+ len(data
)
201 self
.crc
= zlib
.crc32(data
, self
.crc
)
202 self
.fileobj
.write( self
.compress
.compress(data
) )
203 self
.offset
+= len(data
)
205 def read(self
, size
=-1):
206 if self
.mode
!= READ
:
208 raise IOError(errno
.EBADF
, "write() on read-only GzipFile object")
210 if self
.extrasize
<= 0 and self
.fileobj
is None:
214 if size
< 0: # get the whole thing
218 readsize
= readsize
* 2
220 size
= self
.extrasize
221 else: # just get some more of it
223 while size
> self
.extrasize
:
225 readsize
= readsize
* 2
227 if size
> self
.extrasize
:
228 size
= self
.extrasize
230 chunk
= self
.extrabuf
[:size
]
231 self
.extrabuf
= self
.extrabuf
[size
:]
232 self
.extrasize
= self
.extrasize
- size
def _unread(self, buf):
    """Push buf back onto the front of the pending-data buffer.

    buf is prepended to self.extrabuf, self.extrasize grows by len(buf),
    and self.offset moves back by the same amount.
    """
    self.extrabuf = buf + self.extrabuf
    pushed_back = len(buf)
    self.extrasize = pushed_back + self.extrasize
    self.offset -= pushed_back
242 def _read(self
, size
=1024):
243 if self
.fileobj
is None:
244 raise EOFError, "Reached EOF"
247 # If the _new_member flag is set, we have to
248 # jump to the next member, if there is one.
250 # First, check if we're at the end of the file;
251 # if so, it's time to stop; no more members to read.
252 pos
= self
.fileobj
.tell() # Save current position
253 self
.fileobj
.seek(0, 2) # Seek to end of file
254 if pos
== self
.fileobj
.tell():
255 raise EOFError, "Reached EOF"
257 self
.fileobj
.seek( pos
) # Return to original position
260 self
._read
_gzip
_header
()
261 self
.decompress
= zlib
.decompressobj(-zlib
.MAX_WBITS
)
262 self
._new
_member
= False
264 # Read a chunk of data from the file
265 buf
= self
.fileobj
.read(size
)
267 # If the EOF has been reached, flush the decompression object
268 # and mark this object as finished.
271 uncompress
= self
.decompress
.flush()
273 self
._add
_read
_data
( uncompress
)
274 raise EOFError, 'Reached EOF'
276 uncompress
= self
.decompress
.decompress(buf
)
277 self
._add
_read
_data
( uncompress
)
279 if self
.decompress
.unused_data
!= "":
280 # Ending case: we've come to the end of a member in the file,
281 # so seek back to the start of the unused data, finish up
282 # this member, and read a new gzip header.
283 # (The number of bytes to seek back is the length of the unused
284 # data, minus 8 because _read_eof() will rewind a further 8 bytes)
285 self
.fileobj
.seek( -len(self
.decompress
.unused_data
)+8, 1)
287 # Check the CRC and file size, and set the flag so we read
288 # a new member on the next call
290 self
._new
_member
= True
def _add_read_data(self, data):
    """Account for a newly produced chunk of data.

    Folds data into the running CRC in self.crc, appends it to
    self.extrabuf, and adds len(data) to both self.extrasize and
    self.size.
    """
    self.crc = zlib.crc32(data, self.crc)
    self.extrabuf = self.extrabuf + data
    added = len(data)
    self.extrasize += added
    self.size += added
299 # We've read to the end of the file, so we have to rewind in order
300 # to reread the 8 bytes containing the CRC and the file size.
301 # We check that the computed CRC and size of the
302 # uncompressed data matches the stored values. Note that the size
303 # stored is the true file size mod 2**32.
304 self
.fileobj
.seek(-8, 1)
305 crc32
= read32(self
.fileobj
)
306 isize
= U32(read32(self
.fileobj
)) # may exceed 2GB
307 if U32(crc32
) != U32(self
.crc
):
308 raise IOError, "CRC check failed"
309 elif isize
!= LOWU32(self
.size
):
310 raise IOError, "Incorrect length of data produced"
313 if self
.mode
== WRITE
:
314 self
.fileobj
.write(self
.compress
.flush())
315 write32(self
.fileobj
, self
.crc
)
316 # self.size may exceed 2GB, or even 4GB
317 write32u(self
.fileobj
, LOWU32(self
.size
))
319 elif self
.mode
== READ
:
322 self
.myfileobj
.close()
323 self
.myfileobj
= None
327 if (self
.myfileobj
is None and
328 self
.fileobj
is None):
330 except AttributeError:
344 '''Return the uncompressed stream file position indicator to the
345 beginning of the file'''
346 if self
.mode
!= READ
:
347 raise IOError("Can't rewind in write mode")
349 self
._new
_member
= True
354 def seek(self
, offset
):
355 if self
.mode
== WRITE
:
356 if offset
< self
.offset
:
357 raise IOError('Negative seek in write mode')
358 count
= offset
- self
.offset
359 for i
in range(count
// 1024):
360 self
.write(1024 * '\0')
361 self
.write((count
% 1024) * '\0')
362 elif self
.mode
== READ
:
363 if offset
< self
.offset
:
364 # for negative seek, rewind and do positive seek
366 count
= offset
- self
.offset
367 for i
in range(count
// 1024):
369 self
.read(count
% 1024)
371 def readline(self
, size
=-1):
372 if size
< 0: size
= sys
.maxint
374 readsize
= min(100, size
) # Read from the file in small chunks
377 return "".join(bufs
) # Return resulting line
379 c
= self
.read(readsize
)
382 # We set i=size to break out of the loop under two
383 # conditions: 1) there's no newline, and the chunk is
384 # larger than size, or 2) there is a newline, but the
385 # resulting line would be longer than 'size'.
386 if i
==-1 and len(c
) > size
: i
=size
-1
387 elif size
<= i
: i
= size
-1
389 if i
>= 0 or c
== '':
390 bufs
.append(c
[:i
+1]) # Add portion of last chunk
391 self
._unread
(c
[i
+1:]) # Push back rest of chunk
392 return ''.join(bufs
) # Return resulting line
394 # Append chunk to list, decrease 'size',
397 readsize
= min(size
, readsize
* 2)
399 def readlines(self
, sizehint
=0):
400 # Negative numbers result in reading all the lines
402 sizehint
= sys
.maxint
405 line
= self
.readline()
409 sizehint
= sizehint
- len(line
)
413 def writelines(self
, L
):
421 line
= self
.readline()
429 # Act like gzip; with -d, act like gunzip.
430 # The input file is not deleted, however, nor are any other gzip
431 # options or features supported.
433 decompress
= args
and args
[0] == "-d"
441 f
= GzipFile(filename
="", mode
="rb", fileobj
=sys
.stdin
)
444 if arg
[-3:] != ".gz":
445 print "filename doesn't end in .gz:", `arg`
448 g
= __builtin__
.open(arg
[:-3], "wb")
452 g
= GzipFile(filename
="", mode
="wb", fileobj
=sys
.stdout
)
454 f
= __builtin__
.open(arg
, "rb")
455 g
= open(arg
+ ".gz", "wb")
461 if g
is not sys
.stdout
:
463 if f
is not sys
.stdin
:
466 if __name__
== '__main__':