1 """Functions that read and write gzipped files.
3 The user of the file doesn't have to worry about the compression,
4 but random access is not allowed."""
6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
8 import struct
, sys
, time
# Names exported by a star-import of this module.
__all__ = ["GzipFile", "open"]

# Flag constants, one distinct bit each.
# NOTE(review): presumably the FLG bits of the RFC 1952 member header --
# confirm against the header read/write code.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16
19 """Return i as an unsigned integer, assuming it fits in 32 bits.
20 If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
27 """Return the low-order 32 bits, as a non-negative int"""
def write32u(output, value):
    """Write *value* to *output* as four little-endian bytes.

    The integer is packed with the ``"<L"`` struct format (unsigned,
    little-endian, 4 bytes) and handed to ``output.write`` in one call.
    """
    emit = output.write
    emit(struct.pack("<L", value))
36 return struct
.unpack("<I", input.read(4))[0]
def open(filename, mode="rb", compresslevel=9):
    """Create a GzipFile for *filename* and return it.

    Convenience wrapper: the three arguments are forwarded positionally,
    unchanged, to the GzipFile constructor.  *filename* must be supplied;
    *mode* falls back to 'rb' and *compresslevel* to 9.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
48 """The GzipFile class simulates most of the methods of a file object with
49 the exception of the readinto() and truncate() methods.
54 max_read_chunk
= 10 * 1024 * 1024 # 10Mb
56 def __init__(self
, filename
=None, mode
=None,
57 compresslevel
=9, fileobj
=None):
58 """Constructor for the GzipFile class.
60 At least one of fileobj and filename must be given a
63 The new class instance is based on fileobj, which can be a regular
64 file, a StringIO object, or any other object which simulates a file.
65 It defaults to None, in which case filename is opened to provide
68 When fileobj is not None, the filename argument is only used to be
69 included in the gzip file header, which may include the original
70 filename of the uncompressed file. It defaults to the filename of
71 fileobj, if discernible; otherwise, it defaults to the empty string,
72 and in this case the original filename is not included in the header.
74 The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
75 depending on whether the file will be read or written. The default
76 is the mode of fileobj if discernible; otherwise, the default is 'rb'.
77 Be aware that only the 'rb', 'ab', and 'wb' values should be used
78 for cross-platform portability.
80 The compresslevel argument is an integer from 1 to 9 controlling the
81 level of compression; 1 is fastest and produces the least compression,
82 and 9 is slowest and produces the most compression. The default is 9.
86 # guarantee the file is opened in binary mode on platforms
87 # that care about that sort of thing
88 if mode
and 'b' not in mode
:
91 fileobj
= self
.myfileobj
= builtins
.open(filename
, mode
or 'rb')
93 if hasattr(fileobj
, 'name'): filename
= fileobj
.name
96 if hasattr(fileobj
, 'mode'): mode
= fileobj
.mode
101 # Set flag indicating start of a new member
102 self
._new
_member
= True
106 # Starts small, scales exponentially
107 self
.min_readsize
= 100
109 elif mode
[0:1] == 'w' or mode
[0:1] == 'a':
111 self
._init
_write
(filename
)
112 self
.compress
= zlib
.compressobj(compresslevel
,
118 raise IOError("Mode " + mode
+ " not supported")
120 self
.fileobj
= fileobj
123 if self
.mode
== WRITE
:
124 self
._write
_gzip
_header
()
129 warnings
.warn("use the name attribute", DeprecationWarning)
130 if self
.mode
== WRITE
and self
.name
[-3:] != ".gz":
131 return self
.name
+ ".gz"
135 s
= repr(self
.fileobj
)
136 return '<gzip ' + s
[1:-1] + ' ' + hex(id(self
)) + '>'
138 def _init_write(self
, filename
):
140 self
.crc
= zlib
.crc32("") & 0xffffffff
145 def _write_gzip_header(self
):
146 self
.fileobj
.write(b
'\037\213') # magic header
147 self
.fileobj
.write(b
'\010') # compression method
149 # RFC 1952 requires the FNAME field to be Latin-1. Do not
150 # include filenames that cannot be represented that way.
151 fname
= self
.name
.encode('latin-1')
152 if fname
.endswith(b
'.gz'):
154 except UnicodeEncodeError:
159 self
.fileobj
.write(chr(flags
).encode('latin-1'))
160 write32u(self
.fileobj
, int(time
.time()))
161 self
.fileobj
.write(b
'\002')
162 self
.fileobj
.write(b
'\377')
164 self
.fileobj
.write(fname
+ b
'\000')
166 def _init_read(self
):
167 self
.crc
= zlib
.crc32("") & 0xffffffff
170 def _read_gzip_header(self
):
171 magic
= self
.fileobj
.read(2)
172 if magic
!= b
'\037\213':
173 raise IOError('Not a gzipped file')
174 method
= ord( self
.fileobj
.read(1) )
176 raise IOError('Unknown compression method')
177 flag
= ord( self
.fileobj
.read(1) )
178 # modtime = self.fileobj.read(4)
179 # extraflag = self.fileobj.read(1)
180 # os = self.fileobj.read(1)
184 # Read & discard the extra field, if present
185 xlen
= ord(self
.fileobj
.read(1))
186 xlen
= xlen
+ 256*ord(self
.fileobj
.read(1))
187 self
.fileobj
.read(xlen
)
189 # Read and discard a null-terminated string containing the filename
191 s
= self
.fileobj
.read(1)
192 if not s
or s
==b
'\000':
195 # Read and discard a null-terminated string containing a comment
197 s
= self
.fileobj
.read(1)
198 if not s
or s
==b
'\000':
201 self
.fileobj
.read(2) # Read & discard the 16-bit header CRC
204 def write(self
,data
):
205 if self
.mode
!= WRITE
:
207 raise IOError(errno
.EBADF
, "write() on read-only GzipFile object")
209 if self
.fileobj
is None:
210 raise ValueError("write() on closed GzipFile object")
212 self
.size
= self
.size
+ len(data
)
213 self
.crc
= zlib
.crc32(data
, self
.crc
) & 0xffffffff
214 self
.fileobj
.write( self
.compress
.compress(data
) )
215 self
.offset
+= len(data
)
217 def read(self
, size
=-1):
218 if self
.mode
!= READ
:
220 raise IOError(errno
.EBADF
, "read() on write-only GzipFile object")
222 if self
.extrasize
<= 0 and self
.fileobj
is None:
226 if size
< 0: # get the whole thing
230 readsize
= min(self
.max_read_chunk
, readsize
* 2)
232 size
= self
.extrasize
233 else: # just get some more of it
235 while size
> self
.extrasize
:
237 readsize
= min(self
.max_read_chunk
, readsize
* 2)
239 if size
> self
.extrasize
:
240 size
= self
.extrasize
242 chunk
= self
.extrabuf
[:size
]
243 self
.extrabuf
= self
.extrabuf
[size
:]
244 self
.extrasize
= self
.extrasize
- size
249 def _unread(self
, buf
):
250 self
.extrabuf
= buf
+ self
.extrabuf
251 self
.extrasize
= len(buf
) + self
.extrasize
252 self
.offset
-= len(buf
)
254 def _read(self
, size
=1024):
255 if self
.fileobj
is None:
256 raise EOFError("Reached EOF")
259 # If the _new_member flag is set, we have to
260 # jump to the next member, if there is one.
262 # First, check if we're at the end of the file;
263 # if so, it's time to stop; no more members to read.
264 pos
= self
.fileobj
.tell() # Save current position
265 self
.fileobj
.seek(0, 2) # Seek to end of file
266 if pos
== self
.fileobj
.tell():
267 raise EOFError("Reached EOF")
269 self
.fileobj
.seek( pos
) # Return to original position
272 self
._read
_gzip
_header
()
273 self
.decompress
= zlib
.decompressobj(-zlib
.MAX_WBITS
)
274 self
._new
_member
= False
276 # Read a chunk of data from the file
277 buf
= self
.fileobj
.read(size
)
279 # If the EOF has been reached, flush the decompression object
280 # and mark this object as finished.
283 uncompress
= self
.decompress
.flush()
285 self
._add
_read
_data
( uncompress
)
286 raise EOFError('Reached EOF')
288 uncompress
= self
.decompress
.decompress(buf
)
289 self
._add
_read
_data
( uncompress
)
291 if self
.decompress
.unused_data
!= b
"":
292 # Ending case: we've come to the end of a member in the file,
293 # so seek back to the start of the unused data, finish up
294 # this member, and read a new gzip header.
295 # (The number of bytes to seek back is the length of the unused
296 # data, minus 8 because _read_eof() will rewind a further 8 bytes)
297 self
.fileobj
.seek( -len(self
.decompress
.unused_data
)+8, 1)
299 # Check the CRC and file size, and set the flag so we read
300 # a new member on the next call
302 self
._new
_member
= True
304 def _add_read_data(self
, data
):
305 self
.crc
= zlib
.crc32(data
, self
.crc
) & 0xffffffff
306 self
.extrabuf
= self
.extrabuf
+ data
307 self
.extrasize
= self
.extrasize
+ len(data
)
308 self
.size
= self
.size
+ len(data
)
311 # We've read to the end of the file, so we have to rewind in order
312 # to reread the 8 bytes containing the CRC and the file size.
313 # We check that the computed CRC and size of the
314 # uncompressed data match the stored values. Note that the size
315 # stored is the true file size mod 2**32.
316 self
.fileobj
.seek(-8, 1)
317 crc32
= read32(self
.fileobj
)
318 isize
= read32(self
.fileobj
) # may exceed 2GB
319 if crc32
!= self
.crc
:
320 raise IOError("CRC check failed %s != %s" % (hex(crc32
),
322 elif isize
!= (self
.size
& 0xffffffff):
323 raise IOError("Incorrect length of data produced")
326 if self
.mode
== WRITE
:
327 self
.fileobj
.write(self
.compress
.flush())
328 write32u(self
.fileobj
, self
.crc
)
329 # self.size may exceed 2GB, or even 4GB
330 write32u(self
.fileobj
, self
.size
& 0xffffffff)
332 elif self
.mode
== READ
:
335 self
.myfileobj
.close()
336 self
.myfileobj
= None
340 if (self
.myfileobj
is None and
341 self
.fileobj
is None):
343 except AttributeError:
347 def flush(self
,zlib_mode
=zlib
.Z_SYNC_FLUSH
):
348 if self
.mode
== WRITE
:
349 # Ensure the compressor's buffer is flushed
350 self
.fileobj
.write(self
.compress
.flush(zlib_mode
))
354 """Invoke the underlying file object's fileno() method.
356 This will raise AttributeError if the underlying file object
357 doesn't support fileno().
359 return self
.fileobj
.fileno()
368 '''Return the uncompressed stream file position indicator to the
369 beginning of the file'''
370 if self
.mode
!= READ
:
371 raise IOError("Can't rewind in write mode")
373 self
._new
_member
= True
378 def seek(self
, offset
, whence
=0):
381 offset
= self
.offset
+ offset
383 raise ValueError('Seek from end not supported')
384 if self
.mode
== WRITE
:
385 if offset
< self
.offset
:
386 raise IOError('Negative seek in write mode')
387 count
= offset
- self
.offset
389 for i
in range(count
// 1024):
391 self
.write(bytes(count
% 1024))
392 elif self
.mode
== READ
:
393 if offset
< self
.offset
:
394 # for negative seek, rewind and do positive seek
396 count
= offset
- self
.offset
397 for i
in range(count
// 1024):
399 self
.read(count
% 1024)
401 def readline(self
, size
=-1):
404 readsize
= self
.min_readsize
409 c
= self
.read(readsize
)
412 # We set i=size to break out of the loop under two
413 # conditions: 1) there's no newline, and the chunk is
414 # larger than size, or 2) there is a newline, but the
415 # resulting line would be longer than 'size'.
416 if (size
<= i
) or (i
== -1 and len(c
) > size
):
419 if i
>= 0 or c
== b
'':
420 bufs
.append(c
[:i
+ 1]) # Add portion of last chunk
421 self
._unread
(c
[i
+ 1:]) # Push back rest of chunk
424 # Append chunk to list, decrease 'size',
427 readsize
= min(size
, readsize
* 2)
428 if readsize
> self
.min_readsize
:
429 self
.min_readsize
= min(readsize
, self
.min_readsize
* 2, 512)
430 return b
''.join(bufs
) # Return resulting line
432 def readlines(self
, sizehint
=0):
433 # Negative numbers result in reading all the lines
435 sizehint
= sys
.maxsize
438 line
= self
.readline()
442 sizehint
= sizehint
- len(line
)
446 def writelines(self
, L
):
454 line
= self
.readline()
462 # Act like gzip; with -d, act like gunzip.
463 # The input file is not deleted, however, nor are any other gzip
464 # options or features supported.
466 decompress
= args
and args
[0] == "-d"
474 f
= GzipFile(filename
="", mode
="rb", fileobj
=sys
.stdin
)
477 if arg
[-3:] != ".gz":
478 print("filename doesn't end in .gz:", repr(arg
))
481 g
= builtins
.open(arg
[:-3], "wb")
485 g
= GzipFile(filename
="", mode
="wb", fileobj
=sys
.stdout
)
487 f
= builtins
.open(arg
, "rb")
488 g
= open(arg
+ ".gz", "wb")
494 if g
is not sys
.stdout
:
496 if f
is not sys
.stdin
:
499 if __name__
== '__main__':