1 """Functions that read and write gzipped files.
3 The user of the file doesn't have to worry about the compression,
4 but random access is not allowed."""
6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
8 import struct
, sys
, time
12 __all__
= ["GzipFile","open"]
# Bit values of the flag byte in a gzip member header (FLG in RFC 1952).
FTEXT = 1       # file is probably ASCII text
FHCRC = 2       # a 16-bit header CRC follows the header
FEXTRA = 4      # an "extra" field is present
FNAME = 8       # a NUL-terminated original file name is present
FCOMMENT = 16   # a NUL-terminated comment is present
def write32(output, value):
    """Write *value* to *output* as a signed 32-bit little-endian integer.

    *output* only needs a ``write`` method; *value* must fit the struct
    format "<l".
    """
    packed = struct.pack("<l", value)
    output.write(packed)
def write32u(output, value):
    """Write *value* to *output* as an unsigned 32-bit little-endian integer.

    *output* only needs a ``write`` method.  A negative *value* is accepted
    and written as its 32-bit two's-complement bit pattern, so -1 comes out
    as the bytes FF FF FF FF.
    """
    if value < 0:
        # Only negative values are shifted into the unsigned range.  Adding
        # 2**32 to a value that is already non-negative would push it past
        # what the "<L" format can represent.
        value = value + 2 ** 32
    output.write(struct.pack("<L", value))
27 return struct
.unpack("<l", input.read(4))[0]
def open(filename, mode="rb", compresslevel=9):
    """Shorthand for constructing a GzipFile.

    The three arguments are handed to GzipFile positionally and unchanged;
    the newly created object is returned.
    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
36 def __init__(self
, filename
=None, mode
=None,
37 compresslevel
=9, fileobj
=None):
39 fileobj
= self
.myfileobj
= __builtin__
.open(filename
, mode
or 'rb')
41 if hasattr(fileobj
, 'name'): filename
= fileobj
.name
44 if hasattr(fileobj
, 'mode'): mode
= fileobj
.mode
49 # Set flag indicating start of a new member
53 self
.filename
= filename
55 elif mode
[0:1] == 'w' or mode
[0:1] == 'a':
57 self
._init
_write
(filename
)
58 self
.compress
= zlib
.compressobj(compresslevel
,
64 raise ValueError, "Mode " + mode
+ " not supported"
66 self
.fileobj
= fileobj
69 if self
.mode
== WRITE
:
70 self
._write
_gzip
_header
()
73 s
= repr(self
.fileobj
)
74 return '<gzip ' + s
[1:-1] + ' ' + hex(id(self
)) + '>'
def _init_write(self, filename):
    """Set up the per-file state used while writing.

    Stores the output name, appending '.gz' when *filename* does not
    already end with it, and resets the running CRC-32 and the count of
    uncompressed bytes.
    """
    if filename[-3:] != '.gz':
        filename = filename + '.gz'
    self.filename = filename
    self.crc = zlib.crc32("")
    # write() increments self.size, so it must start from a defined value.
    self.size = 0
def _write_gzip_header(self):
    """Write a gzip member header to the underlying file object.

    The header is: the two magic bytes, the compression-method byte, the
    flag byte, the current time as an unsigned 32-bit value, two
    single-byte fields and, only when it is non-empty, the stored name
    followed by a NUL terminator.
    """
    out = self.fileobj
    out.write('\037\213')             # magic header
    out.write('\010')                 # compression method
    # Stored name: self.filename without its last three characters
    # (the '.gz' suffix that _init_write guarantees).
    fname = self.filename[:-3]
    # The flag byte must announce the name field exactly when the field is
    # written, otherwise a reader would misparse what follows the header.
    flags = 0
    if fname:
        flags = FNAME
    out.write(chr(flags))
    write32u(out, long(time.time()))
    out.write('\002')                 # extra-flags byte (XFL in RFC 1952)
    out.write('\377')                 # OS byte (RFC 1952: 255 = unknown)
    if fname:
        out.write(fname + '\000')
100 self
.crc
= zlib
.crc32("")
def _read_gzip_header(self):
    """Consume one gzip member header from the underlying file object.

    Checks the magic bytes and the compression method, then skips the
    fixed fields and whichever optional fields the flag byte announces,
    leaving the file positioned at the start of the compressed data.
    Nothing from the header is kept.

    Raises IOError if the magic bytes are wrong or the compression method
    is not 8.
    """
    magic = self.fileobj.read(2)
    if magic != '\037\213':
        raise IOError('Not a gzipped file')
    method = ord(self.fileobj.read(1))
    if method != 8:
        raise IOError('Unknown compression method')
    flag = ord(self.fileobj.read(1))
    # Skip, in one read, the three fixed fields that are not needed:
    # modtime = self.fileobj.read(4)
    # extraflag = self.fileobj.read(1)
    # os = self.fileobj.read(1)
    self.fileobj.read(6)

    if flag & FEXTRA:
        # Read & discard the extra field, if present.  Its length is a
        # 16-bit little-endian count that precedes the data.
        xlen = ord(self.fileobj.read(1))
        xlen = xlen + 256 * ord(self.fileobj.read(1))
        self.fileobj.read(xlen)
    if flag & FNAME:
        # Read and discard a null-terminated string containing the filename
        while 1:
            s = self.fileobj.read(1)
            # An empty read means end of file; stop rather than spin.
            if not s or s == '\000':
                break
    if flag & FCOMMENT:
        # Read and discard a null-terminated string containing a comment
        while 1:
            s = self.fileobj.read(1)
            if not s or s == '\000':
                break
    if flag & FHCRC:
        self.fileobj.read(2)     # Read & discard the 16-bit header CRC
def write(self, data):
    """Compress *data* and append the result to the underlying file.

    Also folds *data* into the running CRC-32 and adds its length to both
    the uncompressed size and the stream offset.

    Raises ValueError if the object has been closed (fileobj is None).
    """
    target = self.fileobj
    if target is None:
        raise ValueError("write() on closed GzipFile object")

    nbytes = len(data)
    self.size = self.size + nbytes
    self.crc = zlib.crc32(data, self.crc)
    compressed = self.compress.compress(data)
    target.write(compressed)
    self.offset += nbytes
144 def read(self
, size
=-1):
145 if self
.extrasize
<= 0 and self
.fileobj
is None:
149 if size
< 0: # get the whole thing
153 readsize
= readsize
* 2
155 size
= self
.extrasize
156 else: # just get some more of it
158 while size
> self
.extrasize
:
160 readsize
= readsize
* 2
162 if size
> self
.extrasize
:
163 size
= self
.extrasize
165 chunk
= self
.extrabuf
[:size
]
166 self
.extrabuf
= self
.extrabuf
[size
:]
167 self
.extrasize
= self
.extrasize
- size
def _unread(self, buf):
    """Push *buf* back onto the front of the read-ahead buffer.

    The buffered byte count grows by len(buf) and the stream offset moves
    back by the same amount.
    """
    pushed = len(buf)
    self.extrabuf = buf + self.extrabuf
    self.extrasize = pushed + self.extrasize
    self.offset -= pushed
177 def _read(self
, size
=1024):
178 if self
.fileobj
is None: raise EOFError, "Reached EOF"
181 # If the _new_member flag is set, we have to
182 # jump to the next member, if there is one.
184 # First, check if we're at the end of the file;
185 # if so, it's time to stop; no more members to read.
186 pos
= self
.fileobj
.tell() # Save current position
187 self
.fileobj
.seek(0, 2) # Seek to end of file
188 if pos
== self
.fileobj
.tell():
189 raise EOFError, "Reached EOF"
191 self
.fileobj
.seek( pos
) # Return to original position
194 self
._read
_gzip
_header
()
195 self
.decompress
= zlib
.decompressobj(-zlib
.MAX_WBITS
)
198 # Read a chunk of data from the file
199 buf
= self
.fileobj
.read(size
)
201 # If the EOF has been reached, flush the decompression object
202 # and mark this object as finished.
205 uncompress
= self
.decompress
.flush()
207 self
._add
_read
_data
( uncompress
)
208 raise EOFError, 'Reached EOF'
210 uncompress
= self
.decompress
.decompress(buf
)
211 self
._add
_read
_data
( uncompress
)
213 if self
.decompress
.unused_data
!= "":
214 # Ending case: we've come to the end of a member in the file,
215 # so seek back to the start of the unused data, finish up
216 # this member, and read a new gzip header.
217 # (The number of bytes to seek back is the length of the unused
218 # data, minus 8 because _read_eof() will rewind a further 8 bytes)
219 self
.fileobj
.seek( -len(self
.decompress
.unused_data
)+8, 1)
221 # Check the CRC and file size, and set the flag so we read
222 # a new member on the next call
def _add_read_data(self, data):
    """Account for freshly decompressed *data* and buffer it for read().

    Updates the running CRC-32, appends *data* to the read-ahead buffer,
    and adds its length to both the buffered count and the total size.
    """
    nbytes = len(data)
    self.crc = zlib.crc32(data, self.crc)
    self.extrabuf = self.extrabuf + data
    self.extrasize = self.extrasize + nbytes
    self.size = self.size + nbytes
233 # We've read to the end of the file, so we have to rewind in order
234 # to reread the 8 bytes containing the CRC and the file size.
235 # We check that the computed CRC and size of the
236 # uncompressed data matches the stored values.
237 self
.fileobj
.seek(-8, 1)
238 crc32
= read32(self
.fileobj
)
239 isize
= read32(self
.fileobj
)
240 if crc32
%0x100000000L
!= self
.crc
%0x100000000L
:
241 raise ValueError, "CRC check failed"
242 elif isize
!= self
.size
:
243 raise ValueError, "Incorrect length of data produced"
246 if self
.mode
== WRITE
:
247 self
.fileobj
.write(self
.compress
.flush())
248 write32(self
.fileobj
, self
.crc
)
249 write32(self
.fileobj
, self
.size
)
251 elif self
.mode
== READ
:
254 self
.myfileobj
.close()
255 self
.myfileobj
= None
259 if (self
.myfileobj
is None and
260 self
.fileobj
is None):
262 except AttributeError:
276 '''Return the uncompressed stream file position indicator to the
277 beginning of the file'''
278 if self
.mode
!= READ
:
279 raise IOError("Can't rewind in write mode")
def seek(self, offset):
    """Move to absolute position *offset* in the uncompressed stream.

    In write mode only forward movement is possible; the gap is filled by
    writing zero bytes.  In read mode the stream is advanced by reading
    and discarding data; a backward seek first rewinds and then reads
    forward to the target.

    Raises IOError when asked to seek backwards in write mode.
    """
    if self.mode == WRITE:
        if offset < self.offset:
            raise IOError('Negative seek in write mode')
        count = offset - self.offset
        # Pad in 1024-byte pieces so a long seek never builds one huge string.
        chunks, remainder = divmod(count, 1024)
        zeros = 1024 * '\0'
        for i in range(chunks):
            self.write(zeros)
        self.write(remainder * '\0')
    elif self.mode == READ:
        if offset < self.offset:
            # for negative seek, rewind and do positive seek
            # NOTE(review): relies on rewind() resetting self.offset to 0 --
            # confirm against its definition.
            self.rewind()
        count = offset - self.offset
        chunks, remainder = divmod(count, 1024)
        for i in range(chunks):
            self.read(1024)
        self.read(remainder)
302 def readline(self
, size
=-1):
303 if size
< 0: size
= sys
.maxint
305 readsize
= min(100, size
) # Read from the file in small chunks
308 return "".join(bufs
) # Return resulting line
310 c
= self
.read(readsize
)
313 # We set i=size to break out of the loop under two
314 # conditions: 1) there's no newline, and the chunk is
315 # larger than size, or 2) there is a newline, but the
316 # resulting line would be longer than 'size'.
317 if i
==-1 and len(c
) > size
: i
=size
-1
318 elif size
<= i
: i
= size
-1
320 if i
>= 0 or c
== '':
321 bufs
.append(c
[:i
+1]) # Add portion of last chunk
322 self
._unread
(c
[i
+1:]) # Push back rest of chunk
323 return ''.join(bufs
) # Return resulting line
325 # Append chunk to list, decrease 'size',
328 readsize
= min(size
, readsize
* 2)
330 def readlines(self
, sizehint
=0):
331 # Negative numbers result in reading all the lines
332 if sizehint
<= 0: sizehint
= sys
.maxint
335 line
= self
.readline()
338 sizehint
= sizehint
- len(line
)
def writelines(self, L):
    """Write every item of the sequence *L*, in order, via write().

    No separators are added between items.
    """
    for line in L:
        self.write(line)
348 # Act like gzip; with -d, act like gunzip.
349 # The input file is not deleted, however, nor are any other gzip
350 # options or features supported.
352 decompress
= args
and args
[0] == "-d"
360 f
= GzipFile(filename
="", mode
="rb", fileobj
=sys
.stdin
)
363 if arg
[-3:] != ".gz":
364 print "filename doesn't end in .gz:", `arg`
367 g
= __builtin__
.open(arg
[:-3], "wb")
371 g
= GzipFile(filename
="", mode
="wb", fileobj
=sys
.stdout
)
373 f
= __builtin__
.open(arg
, "rb")
374 g
= open(arg
+ ".gz", "wb")
380 if g
is not sys
.stdout
:
382 if f
is not sys
.stdin
:
385 if __name__
== '__main__':