7 # implements a python function that reads and writes a gzipped file
8 # the user of the file doesn't have to worry about the compression,
9 # but random access is not allowed
11 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
# Bit masks for the flags byte of a gzip member header.
# NOTE(review): values match the FLG bits in RFC 1952 -- confirm against
# the header-parsing code, which tests these bits.
FTEXT = 1       # bit 0
FHCRC = 2       # bit 1
FEXTRA = 4      # bit 2
FNAME = 8       # bit 3
FCOMMENT = 16   # bit 4
def write32(output, value):
    """Write value to output as a little-endian signed 32-bit integer.

    output -- any object with a write() method
    value  -- integer accepted by the struct format "<l"
    """
    packed = struct.pack("<l", value)
    output.write(packed)
def write32u(output, value):
    """Write value to output as a little-endian unsigned 32-bit integer.

    output -- any object with a write() method
    value  -- integer accepted by the struct format "<L"
    """
    packed = struct.pack("<L", value)
    output.write(packed)
24 return struct
.unpack("<l", input.read(4))[0]
def open(filename, mode="rb", compresslevel=9):
    """Return a GzipFile for filename.

    Convenience wrapper: the three arguments are passed through to the
    GzipFile constructor unchanged, in the same order.
    """
    gzfile = GzipFile(filename, mode, compresslevel)
    return gzfile
33 def __init__(self
, filename
=None, mode
=None,
34 compresslevel
=9, fileobj
=None):
36 fileobj
= self
.myfileobj
= __builtin__
.open(filename
, mode
or 'rb')
38 if hasattr(fileobj
, 'name'): filename
= fileobj
.name
41 if hasattr(fileobj
, 'mode'): mode
= fileobj
.mode
46 # Set flag indicating start of a new member
50 self
.filename
= filename
52 elif mode
[0:1] == 'w' or mode
[0:1] == 'a':
54 self
._init
_write
(filename
)
55 self
.compress
= zlib
.compressobj(compresslevel
,
61 raise ValueError, "Mode " + mode
+ " not supported"
63 self
.fileobj
= fileobj
65 if self
.mode
== WRITE
:
66 self
._write
_gzip
_header
()
69 s
= repr(self
.fileobj
)
70 return '<gzip ' + s
[1:-1] + ' ' + hex(id(self
)) + '>'
72 def _init_write(self
, filename
):
73 if filename
[-3:] != '.gz':
74 filename
= filename
+ '.gz'
75 self
.filename
= filename
76 self
.crc
= zlib
.crc32("")
81 def _write_gzip_header(self
):
82 self
.fileobj
.write('\037\213') # magic header
83 self
.fileobj
.write('\010') # compression method
84 fname
= self
.filename
[:-3]
88 self
.fileobj
.write(chr(flags
))
89 write32u(self
.fileobj
, long(time
.time()))
90 self
.fileobj
.write('\002')
91 self
.fileobj
.write('\377')
93 self
.fileobj
.write(fname
+ '\000')
96 self
.crc
= zlib
.crc32("")
99 def _read_gzip_header(self
):
100 magic
= self
.fileobj
.read(2)
101 if magic
!= '\037\213':
102 raise IOError, 'Not a gzipped file'
103 method
= ord( self
.fileobj
.read(1) )
105 raise IOError, 'Unknown compression method'
106 flag
= ord( self
.fileobj
.read(1) )
107 # modtime = self.fileobj.read(4)
108 # extraflag = self.fileobj.read(1)
109 # os = self.fileobj.read(1)
113 # Read & discard the extra field, if present
114 xlen
=ord(self
.fileobj
.read(1))
115 xlen
=xlen
+256*ord(self
.fileobj
.read(1))
116 self
.fileobj
.read(xlen
)
118 # Read and discard a null-terminated string containing the filename
120 s
=self
.fileobj
.read(1)
121 if not s
or s
=='\000': break
123 # Read and discard a null-terminated string containing a comment
125 s
=self
.fileobj
.read(1)
126 if not s
or s
=='\000': break
128 self
.fileobj
.read(2) # Read & discard the 16-bit header CRC
131 def write(self
,data
):
132 if self
.fileobj
is None:
133 raise ValueError, "write() on closed GzipFile object"
135 self
.size
= self
.size
+ len(data
)
136 self
.crc
= zlib
.crc32(data
, self
.crc
)
137 self
.fileobj
.write( self
.compress
.compress(data
) )
def writelines(self, lines):
    """Write each string in lines to the file, in order.

    Nothing is inserted between the items.  The previous implementation
    called string.join(lines), whose default separator is a single
    space, so a stray space ended up between every pair of lines in the
    written data.

    lines -- iterable of strings; each one is handed to self.write()
    """
    for line in lines:
        self.write(line)
142 def read(self
, size
=None):
143 if self
.extrasize
<= 0 and self
.fileobj
is None:
147 if not size
: # get the whole thing
151 readsize
= readsize
* 2
153 size
= self
.extrasize
154 else: # just get some more of it
156 while size
> self
.extrasize
:
158 readsize
= readsize
* 2
160 if size
> self
.extrasize
:
161 size
= self
.extrasize
163 chunk
= self
.extrabuf
[:size
]
164 self
.extrabuf
= self
.extrabuf
[size
:]
165 self
.extrasize
= self
.extrasize
- size
169 def _unread(self
, buf
):
170 self
.extrabuf
= buf
+ self
.extrabuf
171 self
.extrasize
= len(buf
) + self
.extrasize
173 def _read(self
, size
=1024):
174 if self
.fileobj
is None: raise EOFError, "Reached EOF"
177 # If the _new_member flag is set, we have to
179 # First, check if we're at the end of the file;
180 # if so, it's time to stop; no more members to read.
181 pos
= self
.fileobj
.tell() # Save current position
182 self
.fileobj
.seek(0, 2) # Seek to end of file
183 if pos
== self
.fileobj
.tell():
185 raise EOFError, "Reached EOF"
187 self
.fileobj
.seek( pos
) # Return to original position
190 self
._read
_gzip
_header
()
191 self
.decompress
= zlib
.decompressobj(-zlib
.MAX_WBITS
)
194 # Read a chunk of data from the file
195 buf
= self
.fileobj
.read(size
)
197 # If the EOF has been reached, flush the decompression object
198 # and mark this object as finished.
201 uncompress
= self
.decompress
.flush()
204 self
._add
_read
_data
( uncompress
)
205 raise EOFError, 'Reached EOF'
207 uncompress
= self
.decompress
.decompress(buf
)
208 self
._add
_read
_data
( uncompress
)
210 if self
.decompress
.unused_data
!= "":
211 # Ending case: we've come to the end of a member in the file,
212 # so seek back to the start of the unused data, finish up
213 # this member, and read a new gzip header.
214 # (The number of bytes to seek back is the length of the unused
215 # data, minus 8 because _read_eof() will rewind a further 8 bytes)
216 self
.fileobj
.seek( -len(self
.decompress
.unused_data
)+8, 1)
218 # Check the CRC and file size, and set the flag so we read
219 # a new member on the next call
223 def _add_read_data(self
, data
):
224 self
.crc
= zlib
.crc32(data
, self
.crc
)
225 self
.extrabuf
= self
.extrabuf
+ data
226 self
.extrasize
= self
.extrasize
+ len(data
)
227 self
.size
= self
.size
+ len(data
)
230 # We've read to the end of the file, so we have to rewind in order
231 # to reread the 8 bytes containing the CRC and the file size.
# We check that the computed CRC and size of the
233 # uncompressed data matches the stored values.
234 self
.fileobj
.seek(-8, 1)
235 crc32
= read32(self
.fileobj
)
236 isize
= read32(self
.fileobj
)
237 if crc32
%0x100000000L
!= self
.crc
%0x100000000L
:
238 raise ValueError, "CRC check failed"
239 elif isize
!= self
.size
:
240 raise ValueError, "Incorrect length of data produced"
243 if self
.mode
== WRITE
:
244 self
.fileobj
.write(self
.compress
.flush())
245 write32(self
.fileobj
, self
.crc
)
246 write32(self
.fileobj
, self
.size
)
248 elif self
.mode
== READ
:
251 self
.myfileobj
.close()
252 self
.myfileobj
= None
255 if (self
.myfileobj
is not None or
256 self
.fileobj
is not None):
263 raise IOError, 'Random access not allowed in gzip files'
266 raise IOError, 'I won\'t tell() you for gzip files'
275 c
= self
.read(readsize
)
276 i
= string
.find(c
, '\n')
277 if i
>= 0 or c
== '':
279 self
._unread
(c
[i
+1:])
280 return string
.join(bufs
, '')
282 readsize
= readsize
* 2
286 lines
= string
.split(buf
, '\n')
287 for i
in range(len(lines
)-1):
288 lines
[i
] = lines
[i
] + '\n'
289 if lines
and not lines
[-1]:
293 def writelines(self
, L
):
299 # Act like gzip; with -d, act like gunzip.
300 # The input file is not deleted, however, nor are any other gzip
301 # options or features supported.
304 decompress
= args
and args
[0] == "-d"
312 f
= GzipFile(filename
="", mode
="rb", fileobj
=sys
.stdin
)
315 if arg
[-3:] != ".gz":
316 print "filename doesn't end in .gz:", `arg`
319 g
= __builtin__
.open(arg
[:-3], "wb")
323 g
= GzipFile(filename
="", mode
="wb", fileobj
=sys
.stdout
)
325 f
= __builtin__
.open(arg
, "rb")
326 g
= open(arg
+ ".gz", "wb")
332 if g
is not sys
.stdout
:
334 if f
is not sys
.stdin
:
337 if __name__
== '__main__':