1 """Functions that read and write gzipped files.
3 The user of the file doesn't have to worry about the compression,
4 but random access is not allowed."""
6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
8 import struct
, sys
, time
12 __all__
= ["GzipFile","open"]
14 FTEXT
, FHCRC
, FEXTRA
, FNAME
, FCOMMENT
= 1, 2, 4, 8, 16
def write32(output, value):
    """Write value to output as a little-endian 32-bit integer.

    output -- any object with a write() method that accepts a byte string.
    value  -- an integer; it may be negative or larger than 32 bits.

    The value is reduced modulo 2**32 before packing.  For every value
    in the signed 32-bit range this yields exactly the bytes the signed
    "<l" format produces (two's complement), while values outside that
    range -- e.g. an unsigned CRC or a byte count of 2 GiB or more --
    are wrapped instead of making struct.pack() fail.  The gzip trailer
    stores the uncompressed size modulo 2**32, so wrapping is the
    correct on-disk representation.
    """
    output.write(struct.pack("<L", value & 0xFFFFFFFF))
def write32u(output, value):
    """Write value to output as a little-endian unsigned 32-bit integer.

    output -- any object with a write() method that accepts a byte string.
    value  -- an integer; negative values are mapped to their unsigned
              two's-complement equivalent (value + 2**32).

    Masking with 0xFFFFFFFF leaves values already in 0 .. 2**32-1
    untouched, converts negative 32-bit values to the matching unsigned
    value, and wraps anything wider, so struct.pack() never sees a
    number outside the range of the "<L" format.
    """
    output.write(struct.pack("<L", value & 0xFFFFFFFF))
27 return struct
.unpack("<l", input.read(4))[0]
def open(filename, mode="rb", compresslevel=9):
    """Return a GzipFile for filename.

    Convenience wrapper: the three arguments are handed to the GzipFile
    constructor unchanged, in the same order.  filename has no default
    and must be supplied; mode falls back to 'rb' and compresslevel
    falls back to 9 when omitted.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
39 """The GzipFile class simulates most of the methods of a file object with
40 the exception of the readinto() and truncate() methods.
46 def __init__(self
, filename
=None, mode
=None,
47 compresslevel
=9, fileobj
=None):
48 """Constructor for the GzipFile class.
50 At least one of fileobj and filename must be given a
53 The new class instance is based on fileobj, which can be a regular
54 file, a StringIO object, or any other object which simulates a file.
55 It defaults to None, in which case filename is opened to provide
58 When fileobj is not None, the filename argument is only used to be
59 included in the gzip file header, which may include the original
60 filename of the uncompressed file. It defaults to the filename of
61 fileobj, if discernible; otherwise, it defaults to the empty string,
62 and in this case the original filename is not included in the header.
64 The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
65 depending on whether the file will be read or written. The default
66 is the mode of fileobj if discernible; otherwise, the default is 'rb'.
67 Be aware that only the 'rb', 'ab', and 'wb' values should be used
68 for cross-platform portability.
70 The compresslevel argument is an integer from 1 to 9 controlling the
71 level of compression; 1 is fastest and produces the least compression,
72 and 9 is slowest and produces the most compression. The default is 9.
76 # guarantee the file is opened in binary mode on platforms
77 # that care about that sort of thing
78 if mode
and 'b' not in mode
:
81 fileobj
= self
.myfileobj
= __builtin__
.open(filename
, mode
or 'rb')
83 if hasattr(fileobj
, 'name'): filename
= fileobj
.name
86 if hasattr(fileobj
, 'mode'): mode
= fileobj
.mode
91 # Set flag indicating start of a new member
92 self
._new
_member
= True
95 self
.filename
= filename
97 elif mode
[0:1] == 'w' or mode
[0:1] == 'a':
99 self
._init
_write
(filename
)
100 self
.compress
= zlib
.compressobj(compresslevel
,
106 raise IOError, "Mode " + mode
+ " not supported"
108 self
.fileobj
= fileobj
111 if self
.mode
== WRITE
:
112 self
._write
_gzip
_header
()
115 s
= repr(self
.fileobj
)
116 return '<gzip ' + s
[1:-1] + ' ' + hex(id(self
)) + '>'
118 def _init_write(self
, filename
):
119 if filename
[-3:] != '.gz':
120 filename
= filename
+ '.gz'
121 self
.filename
= filename
122 self
.crc
= zlib
.crc32("")
127 def _write_gzip_header(self
):
128 self
.fileobj
.write('\037\213') # magic header
129 self
.fileobj
.write('\010') # compression method
130 fname
= self
.filename
[:-3]
134 self
.fileobj
.write(chr(flags
))
135 write32u(self
.fileobj
, long(time
.time()))
136 self
.fileobj
.write('\002')
137 self
.fileobj
.write('\377')
139 self
.fileobj
.write(fname
+ '\000')
141 def _init_read(self
):
142 self
.crc
= zlib
.crc32("")
145 def _read_gzip_header(self
):
146 magic
= self
.fileobj
.read(2)
147 if magic
!= '\037\213':
148 raise IOError, 'Not a gzipped file'
149 method
= ord( self
.fileobj
.read(1) )
151 raise IOError, 'Unknown compression method'
152 flag
= ord( self
.fileobj
.read(1) )
153 # modtime = self.fileobj.read(4)
154 # extraflag = self.fileobj.read(1)
155 # os = self.fileobj.read(1)
159 # Read & discard the extra field, if present
160 xlen
=ord(self
.fileobj
.read(1))
161 xlen
=xlen
+256*ord(self
.fileobj
.read(1))
162 self
.fileobj
.read(xlen
)
164 # Read and discard a null-terminated string containing the filename
166 s
=self
.fileobj
.read(1)
167 if not s
or s
=='\000': break
169 # Read and discard a null-terminated string containing a comment
171 s
=self
.fileobj
.read(1)
172 if not s
or s
=='\000': break
174 self
.fileobj
.read(2) # Read & discard the 16-bit header CRC
177 def write(self
,data
):
178 if self
.mode
!= WRITE
:
180 raise IOError(errno
.EBADF
, "write() on read-only GzipFile object")
182 if self
.fileobj
is None:
183 raise ValueError, "write() on closed GzipFile object"
185 self
.size
= self
.size
+ len(data
)
186 self
.crc
= zlib
.crc32(data
, self
.crc
)
187 self
.fileobj
.write( self
.compress
.compress(data
) )
188 self
.offset
+= len(data
)
190 def read(self
, size
=-1):
191 if self
.mode
!= READ
:
193 raise IOError(errno
.EBADF
, "write() on read-only GzipFile object")
195 if self
.extrasize
<= 0 and self
.fileobj
is None:
199 if size
< 0: # get the whole thing
203 readsize
= readsize
* 2
205 size
= self
.extrasize
206 else: # just get some more of it
208 while size
> self
.extrasize
:
210 readsize
= readsize
* 2
212 if size
> self
.extrasize
:
213 size
= self
.extrasize
215 chunk
= self
.extrabuf
[:size
]
216 self
.extrabuf
= self
.extrabuf
[size
:]
217 self
.extrasize
= self
.extrasize
- size
222 def _unread(self
, buf
):
223 self
.extrabuf
= buf
+ self
.extrabuf
224 self
.extrasize
= len(buf
) + self
.extrasize
225 self
.offset
-= len(buf
)
227 def _read(self
, size
=1024):
228 if self
.fileobj
is None: raise EOFError, "Reached EOF"
231 # If the _new_member flag is set, we have to
232 # jump to the next member, if there is one.
234 # First, check if we're at the end of the file;
235 # if so, it's time to stop; no more members to read.
236 pos
= self
.fileobj
.tell() # Save current position
237 self
.fileobj
.seek(0, 2) # Seek to end of file
238 if pos
== self
.fileobj
.tell():
239 raise EOFError, "Reached EOF"
241 self
.fileobj
.seek( pos
) # Return to original position
244 self
._read
_gzip
_header
()
245 self
.decompress
= zlib
.decompressobj(-zlib
.MAX_WBITS
)
246 self
._new
_member
= False
248 # Read a chunk of data from the file
249 buf
= self
.fileobj
.read(size
)
251 # If the EOF has been reached, flush the decompression object
252 # and mark this object as finished.
255 uncompress
= self
.decompress
.flush()
257 self
._add
_read
_data
( uncompress
)
258 raise EOFError, 'Reached EOF'
260 uncompress
= self
.decompress
.decompress(buf
)
261 self
._add
_read
_data
( uncompress
)
263 if self
.decompress
.unused_data
!= "":
264 # Ending case: we've come to the end of a member in the file,
265 # so seek back to the start of the unused data, finish up
266 # this member, and read a new gzip header.
267 # (The number of bytes to seek back is the length of the unused
268 # data, minus 8 because _read_eof() will rewind a further 8 bytes)
269 self
.fileobj
.seek( -len(self
.decompress
.unused_data
)+8, 1)
271 # Check the CRC and file size, and set the flag so we read
272 # a new member on the next call
274 self
._new
_member
= True
276 def _add_read_data(self
, data
):
277 self
.crc
= zlib
.crc32(data
, self
.crc
)
278 self
.extrabuf
= self
.extrabuf
+ data
279 self
.extrasize
= self
.extrasize
+ len(data
)
280 self
.size
= self
.size
+ len(data
)
283 # We've read to the end of the file, so we have to rewind in order
284 # to reread the 8 bytes containing the CRC and the file size.
285 # We check that the computed CRC and size of the
286 # uncompressed data matches the stored values.
287 self
.fileobj
.seek(-8, 1)
288 crc32
= read32(self
.fileobj
)
289 isize
= read32(self
.fileobj
)
290 if crc32
%0x100000000L
!= self
.crc
%0x100000000L
:
291 raise ValueError, "CRC check failed"
292 elif isize
!= self
.size
:
293 raise ValueError, "Incorrect length of data produced"
296 if self
.mode
== WRITE
:
297 self
.fileobj
.write(self
.compress
.flush())
298 write32(self
.fileobj
, self
.crc
)
299 write32(self
.fileobj
, self
.size
)
301 elif self
.mode
== READ
:
304 self
.myfileobj
.close()
305 self
.myfileobj
= None
309 if (self
.myfileobj
is None and
310 self
.fileobj
is None):
312 except AttributeError:
326 '''Return the uncompressed stream file position indicator to the
327 beginning of the file'''
328 if self
.mode
!= READ
:
329 raise IOError("Can't rewind in write mode")
331 self
._new
_member
= True
336 def seek(self
, offset
):
337 if self
.mode
== WRITE
:
338 if offset
< self
.offset
:
339 raise IOError('Negative seek in write mode')
340 count
= offset
- self
.offset
341 for i
in range(count
/1024):
342 self
.write(1024*'\0')
343 self
.write((count
%1024)*'\0')
344 elif self
.mode
== READ
:
345 if offset
< self
.offset
:
346 # for negative seek, rewind and do positive seek
348 count
= offset
- self
.offset
349 for i
in range(count
/1024): self
.read(1024)
350 self
.read(count
% 1024)
352 def readline(self
, size
=-1):
353 if size
< 0: size
= sys
.maxint
355 readsize
= min(100, size
) # Read from the file in small chunks
358 return "".join(bufs
) # Return resulting line
360 c
= self
.read(readsize
)
363 # We set i=size to break out of the loop under two
364 # conditions: 1) there's no newline, and the chunk is
365 # larger than size, or 2) there is a newline, but the
366 # resulting line would be longer than 'size'.
367 if i
==-1 and len(c
) > size
: i
=size
-1
368 elif size
<= i
: i
= size
-1
370 if i
>= 0 or c
== '':
371 bufs
.append(c
[:i
+1]) # Add portion of last chunk
372 self
._unread
(c
[i
+1:]) # Push back rest of chunk
373 return ''.join(bufs
) # Return resulting line
375 # Append chunk to list, decrease 'size',
378 readsize
= min(size
, readsize
* 2)
380 def readlines(self
, sizehint
=0):
381 # Negative numbers result in reading all the lines
382 if sizehint
<= 0: sizehint
= sys
.maxint
385 line
= self
.readline()
388 sizehint
= sizehint
- len(line
)
392 def writelines(self
, L
):
400 line
= self
.readline()
408 # Act like gzip; with -d, act like gunzip.
409 # The input file is not deleted, however, nor are any other gzip
410 # options or features supported.
412 decompress
= args
and args
[0] == "-d"
420 f
= GzipFile(filename
="", mode
="rb", fileobj
=sys
.stdin
)
423 if arg
[-3:] != ".gz":
424 print "filename doesn't end in .gz:", `arg`
427 g
= __builtin__
.open(arg
[:-3], "wb")
431 g
= GzipFile(filename
="", mode
="wb", fileobj
=sys
.stdout
)
433 f
= __builtin__
.open(arg
, "rb")
434 g
= open(arg
+ ".gz", "wb")
440 if g
is not sys
.stdout
:
442 if f
is not sys
.stdin
:
445 if __name__
== '__main__':