1 """Functions that read and write gzipped files.
3 The user of the file doesn't have to worry about the compression,
4 but random access is not allowed."""
6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
8 import struct
, sys
, time
# Public names exported by a star-import of this module.
__all__ = [
    "GzipFile",
    "open",
]
# Single-bit flag values (powers of two, so they can be OR-ed together).
# NOTE(review): the names match the FLG bits of the gzip member header
# (RFC 1952) -- confirm against the header reading/writing code.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16
def write32(output, value):
    """Write *value* to *output* as a 4-byte little-endian signed integer.

    *output* is any object with a ``write`` method; *value* must fit the
    ``struct`` format ``"<l"``.
    """
    packed = struct.pack("<l", value)
    output.write(packed)
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer.

    *output* is any object with a ``write`` method.  A negative *value*
    is shifted up by 2**32 first, so that the same 32 bits are written
    as for the corresponding signed number.  Non-negative values are
    written unchanged.
    """
    # Only negative inputs need the offset: adding it unconditionally
    # would push every non-negative value past the range of "<L".
    if value < 0:
        value = value + 2 ** 32
    output.write(struct.pack("<L", value))
27 return struct
.unpack("<l", input.read(4))[0]
def open(filename, mode="rb", compresslevel=9):
    """Return a GzipFile built from *filename*, *mode* and *compresslevel*.

    This is a thin convenience wrapper: the three arguments are handed
    straight to the GzipFile constructor.
    """
    return GzipFile(filename=filename, mode=mode,
                    compresslevel=compresslevel)
36 def __init__(self
, filename
=None, mode
=None,
37 compresslevel
=9, fileobj
=None):
39 fileobj
= self
.myfileobj
= __builtin__
.open(filename
, mode
or 'rb')
41 if hasattr(fileobj
, 'name'): filename
= fileobj
.name
44 if hasattr(fileobj
, 'mode'): mode
= fileobj
.mode
49 # Set flag indicating start of a new member
53 self
.filename
= filename
55 elif mode
[0:1] == 'w' or mode
[0:1] == 'a':
57 self
._init
_write
(filename
)
58 self
.compress
= zlib
.compressobj(compresslevel
,
64 raise ValueError, "Mode " + mode
+ " not supported"
66 self
.fileobj
= fileobj
68 if self
.mode
== WRITE
:
69 self
._write
_gzip
_header
()
72 s
= repr(self
.fileobj
)
73 return '<gzip ' + s
[1:-1] + ' ' + hex(id(self
)) + '>'
75 def _init_write(self
, filename
):
76 if filename
[-3:] != '.gz':
77 filename
= filename
+ '.gz'
78 self
.filename
= filename
79 self
.crc
= zlib
.crc32("")
84 def _write_gzip_header(self
):
85 self
.fileobj
.write('\037\213') # magic header
86 self
.fileobj
.write('\010') # compression method
87 fname
= self
.filename
[:-3]
91 self
.fileobj
.write(chr(flags
))
92 write32u(self
.fileobj
, long(time
.time()))
93 self
.fileobj
.write('\002')
94 self
.fileobj
.write('\377')
96 self
.fileobj
.write(fname
+ '\000')
99 self
.crc
= zlib
.crc32("")
102 def _read_gzip_header(self
):
103 magic
= self
.fileobj
.read(2)
104 if magic
!= '\037\213':
105 raise IOError, 'Not a gzipped file'
106 method
= ord( self
.fileobj
.read(1) )
108 raise IOError, 'Unknown compression method'
109 flag
= ord( self
.fileobj
.read(1) )
110 # modtime = self.fileobj.read(4)
111 # extraflag = self.fileobj.read(1)
112 # os = self.fileobj.read(1)
116 # Read & discard the extra field, if present
117 xlen
=ord(self
.fileobj
.read(1))
118 xlen
=xlen
+256*ord(self
.fileobj
.read(1))
119 self
.fileobj
.read(xlen
)
121 # Read and discard a null-terminated string containing the filename
123 s
=self
.fileobj
.read(1)
124 if not s
or s
=='\000': break
126 # Read and discard a null-terminated string containing a comment
128 s
=self
.fileobj
.read(1)
129 if not s
or s
=='\000': break
131 self
.fileobj
.read(2) # Read & discard the 16-bit header CRC
def write(self, data):
    """Compress *data* and write the result to the underlying file object.

    The running uncompressed size and CRC-32 are updated before the
    compressed bytes are written.

    Raises ValueError if the underlying file object is None.
    """
    if self.fileobj is None:
        raise ValueError("write() on closed GzipFile object")

    self.size += len(data)
    self.crc = zlib.crc32(data, self.crc)
    compressed = self.compress.compress(data)
    self.fileobj.write(compressed)
def writelines(self, lines):
    """Write every item of *lines* through ``self.write``.

    As with the ``writelines`` method of ordinary file objects, no
    separator is added between items; callers supply their own line
    terminators.  *lines* may be any iterable.
    """
    # Joining with " " would inject spaces that were never in the data,
    # so each item is passed through unchanged instead.
    for line in lines:
        self.write(line)
145 def read(self
, size
=-1):
146 if self
.extrasize
<= 0 and self
.fileobj
is None:
150 if size
< 0: # get the whole thing
154 readsize
= readsize
* 2
156 size
= self
.extrasize
157 else: # just get some more of it
159 while size
> self
.extrasize
:
161 readsize
= readsize
* 2
163 if size
> self
.extrasize
:
164 size
= self
.extrasize
166 chunk
= self
.extrabuf
[:size
]
167 self
.extrabuf
= self
.extrabuf
[size
:]
168 self
.extrasize
= self
.extrasize
- size
172 def _unread(self
, buf
):
173 self
.extrabuf
= buf
+ self
.extrabuf
174 self
.extrasize
= len(buf
) + self
.extrasize
176 def _read(self
, size
=1024):
177 if self
.fileobj
is None: raise EOFError, "Reached EOF"
180 # If the _new_member flag is set, we have to
181 # jump to the next member, if there is one.
183 # First, check if we're at the end of the file;
184 # if so, it's time to stop; no more members to read.
185 pos
= self
.fileobj
.tell() # Save current position
186 self
.fileobj
.seek(0, 2) # Seek to end of file
187 if pos
== self
.fileobj
.tell():
189 raise EOFError, "Reached EOF"
191 self
.fileobj
.seek( pos
) # Return to original position
194 self
._read
_gzip
_header
()
195 self
.decompress
= zlib
.decompressobj(-zlib
.MAX_WBITS
)
198 # Read a chunk of data from the file
199 buf
= self
.fileobj
.read(size
)
201 # If the EOF has been reached, flush the decompression object
202 # and mark this object as finished.
205 uncompress
= self
.decompress
.flush()
208 self
._add
_read
_data
( uncompress
)
209 raise EOFError, 'Reached EOF'
211 uncompress
= self
.decompress
.decompress(buf
)
212 self
._add
_read
_data
( uncompress
)
214 if self
.decompress
.unused_data
!= "":
215 # Ending case: we've come to the end of a member in the file,
216 # so seek back to the start of the unused data, finish up
217 # this member, and read a new gzip header.
218 # (The number of bytes to seek back is the length of the unused
219 # data, minus 8 because _read_eof() will rewind a further 8 bytes)
220 self
.fileobj
.seek( -len(self
.decompress
.unused_data
)+8, 1)
222 # Check the CRC and file size, and set the flag so we read
223 # a new member on the next call
227 def _add_read_data(self
, data
):
228 self
.crc
= zlib
.crc32(data
, self
.crc
)
229 self
.extrabuf
= self
.extrabuf
+ data
230 self
.extrasize
= self
.extrasize
+ len(data
)
231 self
.size
= self
.size
+ len(data
)
234 # We've read to the end of the file, so we have to rewind in order
235 # to reread the 8 bytes containing the CRC and the file size.
236 # We check that the computed CRC and size of the
237 # uncompressed data match the stored values.
238 self
.fileobj
.seek(-8, 1)
239 crc32
= read32(self
.fileobj
)
240 isize
= read32(self
.fileobj
)
241 if crc32
%0x100000000L
!= self
.crc
%0x100000000L
:
242 raise ValueError, "CRC check failed"
243 elif isize
!= self
.size
:
244 raise ValueError, "Incorrect length of data produced"
247 if self
.mode
== WRITE
:
248 self
.fileobj
.write(self
.compress
.flush())
249 write32(self
.fileobj
, self
.crc
)
250 write32(self
.fileobj
, self
.size
)
252 elif self
.mode
== READ
:
255 self
.myfileobj
.close()
256 self
.myfileobj
= None
260 if (self
.myfileobj
is None and
261 self
.fileobj
is None):
263 except AttributeError:
271 raise IOError, 'Random access not allowed in gzip files'
274 raise IOError, 'I won\'t tell() you for gzip files'
279 def readline(self
, size
=-1):
280 if size
< 0: size
= sys
.maxint
283 readsize
= min(100, size
) # Read from the file in small chunks
286 return "".join(bufs
) # Return resulting line
288 c
= self
.read(readsize
)
291 # We set i=size to break out of the loop under two
292 # conditions: 1) there's no newline, and the chunk is
293 # larger than size, or 2) there is a newline, but the
294 # resulting line would be longer than 'size'.
295 if i
==-1 and len(c
) > size
: i
=size
-1
296 elif size
<= i
: i
= size
-1
298 if i
>= 0 or c
== '':
299 bufs
.append(c
[:i
+1]) # Add portion of last chunk
300 self
._unread
(c
[i
+1:]) # Push back rest of chunk
301 return ''.join(bufs
) # Return resulting line
303 # Append chunk to list, decrease 'size',
306 readsize
= min(size
, readsize
* 2)
308 def readlines(self
, sizehint
=0):
309 # Negative numbers result in reading all the lines
310 if sizehint
<= 0: sizehint
= sys
.maxint
313 line
= self
.readline()
316 sizehint
= sizehint
- len(line
)
320 def writelines(self
, L
):
326 # Act like gzip; with -d, act like gunzip.
327 # The input file is not deleted, however, nor are any other gzip
328 # options or features supported.
331 decompress
= args
and args
[0] == "-d"
339 f
= GzipFile(filename
="", mode
="rb", fileobj
=sys
.stdin
)
342 if arg
[-3:] != ".gz":
343 print "filename doesn't end in .gz:", `arg`
346 g
= __builtin__
.open(arg
[:-3], "wb")
350 g
= GzipFile(filename
="", mode
="wb", fileobj
=sys
.stdout
)
352 f
= __builtin__
.open(arg
, "rb")
353 g
= open(arg
+ ".gz", "wb")
359 if g
is not sys
.stdout
:
361 if f
is not sys
.stdin
:
364 if __name__
== '__main__':