append(): Fixing the test for convertibility after consultation with
[python/dscho.git] / Lib / zipfile.py
blob4f2b9468a941ffecc3d520e0bb3cd4e970592f75
1 "Read and write ZIP files."
2 # Written by James C. Ahlstrom jim@interet.com
3 # All rights transferred to CNRI pursuant to the Python contribution agreement
5 import struct, os, time
6 import binascii
8 try:
9 import zlib # We may need its compression method
10 except ImportError:
11 zlib = None
# Names exported by "from zipfile import *".
__all__ = [
    "BadZipfile",
    "error",
    "ZIP_STORED",
    "ZIP_DEFLATED",
    "is_zipfile",
    "ZipInfo",
    "ZipFile",
    "PyZipFile",
]
class BadZipfile(Exception):
    """Raised by this module when a file is not a usable ZIP archive."""


# Alias kept so callers can catch the module's exception as "error".
error = BadZipfile
# Compression methods this module can handle; every other ZIP compression
# method is unsupported.
ZIP_STORED = 0
ZIP_DEFLATED = 8

# struct module formats and 4-byte magic numbers for the fixed-size records.
# End of archive record: 9 items, 22 bytes.
structEndArchive = "<4s4H2lH"
stringEndArchive = "PK\005\006"
# Central directory record: 19 items, 46 bytes.
structCentralDir = "<4s4B4H3l5HLl"
stringCentralDir = "PK\001\002"
# Local file header record: 12 items, 30 bytes.
structFileHeader = "<4s2B4H3l2H"
stringFileHeader = "PK\003\004"

# Positions of the items in an unpacked central directory record.
(
    _CD_SIGNATURE,
    _CD_CREATE_VERSION,
    _CD_CREATE_SYSTEM,
    _CD_EXTRACT_VERSION,
    _CD_EXTRACT_SYSTEM,             # is this meaningful?
    _CD_FLAG_BITS,
    _CD_COMPRESS_TYPE,
    _CD_TIME,
    _CD_DATE,
    _CD_CRC,
    _CD_COMPRESSED_SIZE,
    _CD_UNCOMPRESSED_SIZE,
    _CD_FILENAME_LENGTH,
    _CD_EXTRA_FIELD_LENGTH,
    _CD_COMMENT_LENGTH,
    _CD_DISK_NUMBER_START,
    _CD_INTERNAL_FILE_ATTRIBUTES,
    _CD_EXTERNAL_FILE_ATTRIBUTES,
    _CD_LOCAL_HEADER_OFFSET,
) = range(19)

# Positions of the items in an unpacked local file header record.
(
    _FH_SIGNATURE,
    _FH_EXTRACT_VERSION,
    _FH_EXTRACT_SYSTEM,             # is this meaningful?
    _FH_GENERAL_PURPOSE_FLAG_BITS,
    _FH_COMPRESSION_METHOD,
    _FH_LAST_MOD_TIME,
    _FH_LAST_MOD_DATE,
    _FH_CRC,
    _FH_COMPRESSED_SIZE,
    _FH_UNCOMPRESSED_SIZE,
    _FH_FILENAME_LENGTH,
    _FH_EXTRA_FIELD_LENGTH,
) = range(12)
def is_zipfile(filename):
    """Quickly see if file is a ZIP file by checking the magic number.

    filename is the path of the file to test.  Returns True when an
    "End of Central Directory" record is found, and False otherwise,
    including when the file cannot be opened or read (IOError).
    """
    try:
        fpin = open(filename, "rb")
        try:
            endrec = _EndRecData(fpin)
        finally:
            # Close the file even when reading it raised, so a failed
            # probe does not leak the handle.
            fpin.close()
        if endrec:
            return True                 # file has correct magic number
    except IOError:
        pass
    return False
def _EndRecData(fpin):
    """Return data from the "End of Central Directory" record, or None.

    fpin is a seekable file object opened for binary reading.

    The data is a list of the nine items in the ZIP "End of central dir"
    record followed by a tenth item, the archive comment, and an eleventh
    item, the file seek offset of this record.  None is returned when no
    valid record is found, including when the file is too short to hold
    one."""
    fpin.seek(0, 2)
    filesize = fpin.tell()              # Get file size
    if filesize < 22:
        # Too short to contain the 22-byte record at all.
        return None
    fpin.seek(-22, 2)                   # Assume no archive comment.
    data = fpin.read()
    if (len(data) == 22 and data[0:4] == stringEndArchive
            and data[-2:] == "\000\000"):
        endrec = list(struct.unpack(structEndArchive, data))
        endrec.append("")               # Append the archive comment
        endrec.append(filesize - 22)    # Append the record start offset
        return endrec
    # Search the tail of the file for the record signature.  The comment
    # is appended to the ZIP file and has a 16 bit length, so the record
    # can start at most 64K + 22 bytes before the end of the file.
    # The signature must not appear in the comment.
    END_BLOCK = min(filesize, (1 << 16) + 22)
    fpin.seek(filesize - END_BLOCK, 0)
    data = fpin.read()
    start = data.rfind(stringEndArchive)
    if start >= 0:                      # Correct signature string was found
        record = data[start:start+22]
        if len(record) == 22:           # Ignore a signature cut off by EOF
            endrec = list(struct.unpack(structEndArchive, record))
            comment = data[start+22:]
            if endrec[7] == len(comment):   # Comment length checks out
                # Append the archive comment and start offset
                endrec.append(comment)
                endrec.append(filesize - END_BLOCK + start)
                return endrec
    return None                         # Error, return None
class ZipInfo:
    """Record of the attributes of a single member of a ZIP archive."""

    def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
        """Create the record for the member called filename.

        date_time is a (year, month, day, hour, min, sec) sequence.  All
        remaining attributes start out with standard values.
        """
        # The member name is stored with "/" as the directory separator.
        self.filename = _normpath(filename)
        self.date_time = date_time
        # Standard values:
        self.compress_type = ZIP_STORED     # compression used for the member
        self.comment = ""                   # per-member comment
        self.extra = ""                     # ZIP extra data
        self.create_system = 0              # system which created the archive
        self.create_version = 20            # version which created the archive
        self.extract_version = 20           # version needed to extract
        self.reserved = 0                   # must be zero
        self.flag_bits = 0                  # ZIP flag bits
        self.volume = 0                     # volume number of file header
        self.internal_attr = 0              # internal attributes
        self.external_attr = 0              # external file attributes
        # Other attributes are set by class ZipFile:
        # header_offset         Byte offset to the file header
        # file_offset           Byte offset to the start of the file data
        # CRC                   CRC-32 of the uncompressed file
        # compress_size         Size of the compressed file
        # file_size             Size of the uncompressed file

    def FileHeader(self):
        """Return the per-file (local) header as a string.

        The packed fixed-size header is followed by the file name and the
        extra data.
        """
        stamp = self.date_time
        # Pack the date and time into the two 16-bit DOS fields.
        dosdate = ((stamp[0] - 1980) << 9) | (stamp[1] << 5) | stamp[2]
        dostime = (stamp[3] << 11) | (stamp[4] << 5) | (stamp[5] // 2)
        if self.flag_bits & 0x08:
            # These are written after the file data, so the header carries
            # zeros in their place.
            checksum = stored_size = plain_size = 0
        else:
            checksum = self.CRC
            stored_size = self.compress_size
            plain_size = self.file_size
        fields = (stringFileHeader,
                  self.extract_version, self.reserved, self.flag_bits,
                  self.compress_type, dostime, dosdate, checksum,
                  stored_size, plain_size,
                  len(self.filename), len(self.extra))
        fixed_part = struct.pack(structFileHeader, *fields)
        return fixed_part + self.filename + self.extra
# The ZIP format specification requires forward slashes as the directory
# separator, so member names built from native paths are converted on
# platforms whose separator is something else.
def _normpath(path):
    """Return path unchanged; the native separator is already "/"."""
    return path

if os.sep != "/":
    def _normpath(path):
        """Return path with the native separator replaced by "/"."""
        return path.replace(os.sep, "/")
class ZipFile:
    """ Class with methods to open, read, write, close, list zip files.

    z = ZipFile(file, mode="r", compression=ZIP_STORED)

    file: Either the path to the file, or a file-like object.
          If it is a path, the file will be opened and closed by ZipFile.
    mode: The mode can be either read "r", write "w" or append "a".
    compression: ZIP_STORED (no compression) or ZIP_DEFLATED (requires zlib).
    """

    fp = None                   # Set here since __del__ checks it

    def __init__(self, file, mode="r", compression=ZIP_STORED):
        """Open the ZIP file with mode read "r", write "w" or append "a".

        Only the first character of mode is significant.  Raises
        RuntimeError for an unsupported compression method or mode, and
        BadZipfile when mode "r" is used on a file that is not a ZIP
        archive.
        """
        if compression == ZIP_STORED:
            pass
        elif compression == ZIP_DEFLATED:
            if not zlib:
                raise RuntimeError(
                    "Compression requires the (missing) zlib module")
        else:
            raise RuntimeError("That compression method is not supported")
        self.debug = 0  # Level of printing: 0 through 3
        self.NameToInfo = {}    # Find file info given name
        self.filelist = []      # List of ZipInfo instances for archive
        self.compression = compression  # Method of compression
        self.mode = key = mode[0]

        # Validate the mode before touching the file, so a bad mode is
        # reported as RuntimeError for paths and file objects alike and
        # nothing is opened (or truncated) on the way.
        modeDict = {'r' : 'rb', 'w': 'wb', 'a' : 'r+b'}
        if key not in modeDict:
            raise RuntimeError('Mode must be "r", "w" or "a"')

        # Check if we were passed a file-like object
        if isinstance(file, basestring):
            self._filePassed = 0
            self.filename = file
            self.fp = open(file, modeDict[key])
        else:
            self._filePassed = 1
            self.fp = file
            self.filename = getattr(file, 'name', None)

        if key == 'r':
            self._GetContents()
        elif key == 'a':
            try:                        # See if file is a zip file
                self._RealGetContents()
                # seek to start of directory and overwrite
                self.fp.seek(self.start_dir, 0)
            except BadZipfile:          # file is not a zip file, just append
                self.fp.seek(0, 2)
        # Mode "w" starts from an empty archive: nothing to read.

    def _GetContents(self):
        """Read the directory, making sure we close the file if the format
        is bad."""
        try:
            self._RealGetContents()
        except BadZipfile:
            if not self._filePassed:
                self.fp.close()
                self.fp = None
            raise

    def _RealGetContents(self):
        """Read in the table of contents for the ZIP file.

        Fills self.filelist and self.NameToInfo with one ZipInfo per
        member and sets self.comment and self.start_dir.  Raises
        BadZipfile when a record is missing, truncated or has a bad magic
        number, and RuntimeError when the name in a local header differs
        from the one in the central directory.
        """
        fp = self.fp
        endrec = _EndRecData(fp)
        if not endrec:
            raise BadZipfile("File is not a zip file")
        if self.debug > 1:
            print(endrec)
        size_cd = endrec[5]             # bytes in central directory
        offset_cd = endrec[6]           # offset of central directory
        self.comment = endrec[8]        # archive comment
        # endrec[9] is the offset of the "End of Central Dir" record
        inferred = endrec[9] - size_cd
        # "concat" is zero, unless zip was concatenated to another file
        concat = inferred - offset_cd
        if self.debug > 2:
            print("given, inferred, offset %s %s %s"
                  % (offset_cd, inferred, concat))
        # self.start_dir:  Position of start of central directory
        self.start_dir = offset_cd + concat
        fp.seek(self.start_dir, 0)
        total = 0
        while total < size_cd:
            centdir = fp.read(46)
            total = total + 46
            if centdir[0:4] != stringCentralDir:
                raise BadZipfile("Bad magic number for central directory")
            if len(centdir) != 46:
                # A short read would otherwise surface as struct.error.
                raise BadZipfile("Truncated central directory")
            centdir = struct.unpack(structCentralDir, centdir)
            if self.debug > 2:
                print(centdir)
            filename = fp.read(centdir[_CD_FILENAME_LENGTH])
            # Create ZipInfo instance to store file information
            zinfo = ZipInfo(filename)
            zinfo.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
            zinfo.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
            total = (total + centdir[_CD_FILENAME_LENGTH]
                     + centdir[_CD_EXTRA_FIELD_LENGTH]
                     + centdir[_CD_COMMENT_LENGTH])
            zinfo.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] + concat
            # file_offset must be computed below...
            (zinfo.create_version, zinfo.create_system,
             zinfo.extract_version, zinfo.reserved,
             zinfo.flag_bits, zinfo.compress_type, t, d,
             zinfo.CRC, zinfo.compress_size,
             zinfo.file_size) = centdir[1:12]
            (zinfo.volume, zinfo.internal_attr,
             zinfo.external_attr) = centdir[15:18]
            # Convert date/time code to (year, month, day, hour, min, sec)
            zinfo.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F,
                                t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
            self.filelist.append(zinfo)
            self.NameToInfo[zinfo.filename] = zinfo
            if self.debug > 2:
                print("total %s" % total)
        for data in self.filelist:
            fp.seek(data.header_offset, 0)
            fheader = fp.read(30)
            if fheader[0:4] != stringFileHeader:
                raise BadZipfile("Bad magic number for file header")
            if len(fheader) != 30:
                # A short read would otherwise surface as struct.error.
                raise BadZipfile("Truncated file header")
            fheader = struct.unpack(structFileHeader, fheader)
            # file_offset is computed here, since the extra field for
            # the central directory and for the local file header
            # refer to different fields, and they can have different
            # lengths
            data.file_offset = (data.header_offset + 30
                                + fheader[_FH_FILENAME_LENGTH]
                                + fheader[_FH_EXTRA_FIELD_LENGTH])
            fname = fp.read(fheader[_FH_FILENAME_LENGTH])
            if fname != data.filename:
                raise RuntimeError(
                    'File name in directory "%s" and header "%s" differ.' % (
                        data.filename, fname))

    def namelist(self):
        """Return a list of file names in the archive."""
        return [data.filename for data in self.filelist]

    def infolist(self):
        """Return a list of class ZipInfo instances for files in the
        archive."""
        return self.filelist

    def printdir(self):
        """Print a table of contents for the zip file."""
        print("%-46s %19s %12s" % ("File Name", "Modified    ", "Size"))
        for zinfo in self.filelist:
            date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time
            print("%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size))

    def testzip(self):
        """Read all the files and check the CRC.

        Returns the name of the first member that cannot be read back
        correctly, or None when every member is fine.
        """
        for zinfo in self.filelist:
            try:
                self.read(zinfo.filename)       # Check CRC-32
            except (KeyboardInterrupt, SystemExit):
                # A request to stop is not evidence of a bad member.
                raise
            except Exception:
                return zinfo.filename
        return None

    def getinfo(self, name):
        """Return the instance of ZipInfo given 'name'.

        Raises KeyError when the archive has no member called name.
        """
        return self.NameToInfo[name]

    def read(self, name):
        """Return file bytes (as a string) for name.

        Raises RuntimeError when the archive is not open in mode "r" or
        "a" or zlib is needed but missing, KeyError for an unknown name,
        and BadZipfile for an unsupported compression method or a CRC-32
        mismatch.
        """
        if self.mode not in ("r", "a"):
            raise RuntimeError('read() requires mode "r" or "a"')
        if not self.fp:
            raise RuntimeError(
                "Attempt to read ZIP archive that was already closed")
        zinfo = self.getinfo(name)
        # Restore the file position afterwards: in append mode it marks
        # where the next member will be written.
        filepos = self.fp.tell()
        self.fp.seek(zinfo.file_offset, 0)
        data = self.fp.read(zinfo.compress_size)
        self.fp.seek(filepos, 0)
        if zinfo.compress_type == ZIP_STORED:
            pass
        elif zinfo.compress_type == ZIP_DEFLATED:
            if not zlib:
                raise RuntimeError(
                    "De-compression requires the (missing) zlib module")
            # zlib compress/decompress code by Jeremy Hylton of CNRI
            dc = zlib.decompressobj(-15)
            data = dc.decompress(data)
            # need to feed in unused pad byte so that zlib won't choke
            ex = dc.decompress('Z') + dc.flush()
            if ex:
                data = data + ex
        else:
            raise BadZipfile(
                "Unsupported compression method %d for file %s" %
                (zinfo.compress_type, name))
        crc = binascii.crc32(data)
        if crc != zinfo.CRC:
            raise BadZipfile("Bad CRC-32 for file %s" % name)
        return data

    def _writecheck(self, zinfo):
        """Check for errors before writing a file to the archive.

        Raises RuntimeError when the archive is closed or not open in mode
        "w" or "a", or when the compression method of zinfo cannot be
        used.  A duplicate member name only produces a debug message.
        """
        if zinfo.filename in self.NameToInfo:
            if self.debug:      # Warning for duplicate names
                print("Duplicate name: %s" % zinfo.filename)
        if self.mode not in ("w", "a"):
            raise RuntimeError('write() requires mode "w" or "a"')
        if not self.fp:
            raise RuntimeError(
                "Attempt to write ZIP archive that was already closed")
        if zinfo.compress_type == ZIP_DEFLATED and not zlib:
            raise RuntimeError(
                "Compression requires the (missing) zlib module")
        if zinfo.compress_type not in (ZIP_STORED, ZIP_DEFLATED):
            raise RuntimeError(
                "That compression method is not supported")

    def write(self, filename, arcname=None, compress_type=None):
        """Put the bytes from filename into the archive under the name
        arcname.

        arcname defaults to filename and compress_type to the compression
        chosen when the archive was opened.  The archive file must be
        seekable, because the CRC and sizes are patched into the header
        after the data has been written.
        """
        st = os.stat(filename)
        mtime = time.localtime(st.st_mtime)
        date_time = mtime[0:6]
        # Create ZipInfo instance to store file information
        if arcname is None:
            zinfo = ZipInfo(filename, date_time)
        else:
            zinfo = ZipInfo(arcname, date_time)
        # Unix attributes go in the high 16 bits.  Multiplying is the same
        # as shifting left by 16, and promotes to a long instead of
        # overflowing a plain int on older interpreters.
        zinfo.external_attr = st[0] * 0x10000
        if compress_type is None:
            zinfo.compress_type = self.compression
        else:
            zinfo.compress_type = compress_type
        self._writecheck(zinfo)
        fp = open(filename, "rb")
        try:
            zinfo.flag_bits = 0x00
            zinfo.header_offset = self.fp.tell()    # Start of header bytes
            # Must overwrite CRC and sizes with correct data later
            zinfo.CRC = CRC = 0
            zinfo.compress_size = compress_size = 0
            zinfo.file_size = file_size = 0
            self.fp.write(zinfo.FileHeader())
            zinfo.file_offset = self.fp.tell()      # Start of file bytes
            if zinfo.compress_type == ZIP_DEFLATED:
                cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
                                        zlib.DEFLATED, -15)
            else:
                cmpr = None
            while 1:
                buf = fp.read(1024 * 8)
                if not buf:
                    break
                file_size = file_size + len(buf)
                CRC = binascii.crc32(buf, CRC)
                if cmpr:
                    buf = cmpr.compress(buf)
                    compress_size = compress_size + len(buf)
                self.fp.write(buf)
        finally:
            # Release the source file even if reading or writing failed.
            fp.close()
        if cmpr:
            buf = cmpr.flush()
            compress_size = compress_size + len(buf)
            self.fp.write(buf)
            zinfo.compress_size = compress_size
        else:
            zinfo.compress_size = file_size
        zinfo.CRC = CRC
        zinfo.file_size = file_size
        # Seek backwards and write CRC and file sizes
        position = self.fp.tell()       # Preserve current position in file
        self.fp.seek(zinfo.header_offset + 14, 0)
        self.fp.write(struct.pack("<lll", zinfo.CRC, zinfo.compress_size,
                                  zinfo.file_size))
        self.fp.seek(position, 0)
        self.filelist.append(zinfo)
        self.NameToInfo[zinfo.filename] = zinfo

    def writestr(self, zinfo, bytes):
        """Write a file into the archive.  The contents is the string
        'bytes'.

        zinfo is the ZipInfo instance describing the new member; its CRC,
        sizes and offsets are filled in here.
        """
        self._writecheck(zinfo)
        zinfo.file_size = len(bytes)            # Uncompressed size
        zinfo.CRC = binascii.crc32(bytes)       # CRC-32 checksum
        if zinfo.compress_type == ZIP_DEFLATED:
            co = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
                                  zlib.DEFLATED, -15)
            bytes = co.compress(bytes) + co.flush()
            zinfo.compress_size = len(bytes)    # Compressed size
        else:
            zinfo.compress_size = zinfo.file_size
        zinfo.header_offset = self.fp.tell()    # Start of header bytes
        self.fp.write(zinfo.FileHeader())
        zinfo.file_offset = self.fp.tell()      # Start of file bytes
        self.fp.write(bytes)
        if zinfo.flag_bits & 0x08:
            # Write CRC and file sizes after the file data
            self.fp.write(struct.pack("<lll", zinfo.CRC, zinfo.compress_size,
                                      zinfo.file_size))
        self.filelist.append(zinfo)
        self.NameToInfo[zinfo.filename] = zinfo

    def __del__(self):
        """Call the "close()" method in case the user forgot."""
        self.close()

    def close(self):
        """Close the file, and for mode "w" and "a" write the ending
        records.

        Calling close() on an archive that is already closed does nothing.
        The archive is marked closed even when writing the ending records
        fails, and the underlying file is closed too if ZipFile opened it.
        """
        if self.fp is None:
            return
        try:
            if self.mode in ("w", "a"):             # write ending records
                count = len(self.filelist)
                pos1 = self.fp.tell()
                for zinfo in self.filelist:     # write central directory
                    dt = zinfo.date_time
                    dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
                    dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
                    centdir = struct.pack(structCentralDir,
                        stringCentralDir, zinfo.create_version,
                        zinfo.create_system, zinfo.extract_version,
                        zinfo.reserved, zinfo.flag_bits,
                        zinfo.compress_type, dostime, dosdate,
                        zinfo.CRC, zinfo.compress_size, zinfo.file_size,
                        len(zinfo.filename), len(zinfo.extra),
                        len(zinfo.comment),
                        0, zinfo.internal_attr, zinfo.external_attr,
                        zinfo.header_offset)
                    self.fp.write(centdir)
                    self.fp.write(zinfo.filename)
                    self.fp.write(zinfo.extra)
                    self.fp.write(zinfo.comment)
                pos2 = self.fp.tell()
                # Write end-of-zip-archive record
                endrec = struct.pack(structEndArchive, stringEndArchive,
                                     0, 0, count, count, pos2 - pos1, pos1, 0)
                self.fp.write(endrec)
                self.fp.flush()
        finally:
            # Always drop the handle, so a failed write cannot leave the
            # file open or make __del__ retry the same failing write.
            fp = self.fp
            self.fp = None
            if not self._filePassed:
                fp.close()
class PyZipFile(ZipFile):
    """Class to create ZIP archives with Python library files and packages."""

    def writepy(self, pathname, basename = ""):
        """Add all files from "pathname" to the ZIP archive.

        Three kinds of pathname are handled:

        - a package directory (one holding __init__.py): its modules and,
          recursively, its sub-packages are added below basename;
        - a plain directory: every *.py file directly inside it is added;
        - anything else must be a *.py file, which is added on its own.

        Modules are always stored as module.pyo or module.pyc, and
        module.py is compiled into module.pyc first if necessary.
        """
        dir, name = os.path.split(pathname)

        if not os.path.isdir(pathname):
            # A single module file.
            if pathname[-3:] != ".py":
                raise RuntimeError(
                    'Files added with writepy() must end with ".py"')
            self._add_module(pathname[0:-3], basename, "Adding file")
            return

        initname = os.path.join(pathname, "__init__.py")
        if not os.path.isfile(initname):
            # This is NOT a package directory, add its files at top level
            if self.debug:
                print("Adding files from directory %s" % pathname)
            for filename in os.listdir(pathname):
                path = os.path.join(pathname, filename)
                root, ext = os.path.splitext(filename)
                if ext == ".py":
                    self._add_module(path[0:-3], basename, "Adding")
            return

        # This is a package directory, add it
        if not basename:
            basename = name
        else:
            basename = "%s/%s" % (basename, name)
        if self.debug:
            print("Adding package in %s as %s" % (pathname, basename))
        self._add_module(initname[0:-3], basename, "Adding")
        entries = os.listdir(pathname)
        entries.remove("__init__.py")   # already added above
        # Add all *.py files and package subdirectories
        for filename in entries:
            path = os.path.join(pathname, filename)
            root, ext = os.path.splitext(filename)
            if os.path.isdir(path):
                # Only sub-directories that are packages are followed.
                if os.path.isfile(os.path.join(path, "__init__.py")):
                    self.writepy(path, basename)  # Recursive call
            elif ext == ".py":
                self._add_module(path[0:-3], basename, "Adding")

    def _add_module(self, modpath, basename, label):
        """Write the compiled file for the module path modpath.

        label is the text printed in front of the archive name when
        debugging output is enabled.
        """
        fname, arcname = self._get_codename(modpath, basename)
        if self.debug:
            print("%s %s" % (label, arcname))
        self.write(fname, arcname)

    def _get_codename(self, pathname, basename):
        """Return (filename, archivename) for the path.

        pathname is a module path without extension.  The .pyo file is
        used when it exists and is at least as new as the source;
        otherwise the .pyc file is used, after compiling the source when
        the .pyc is missing or older than it.  The archive name is the
        chosen file's base name, placed below basename when one is given.
        """
        file_py  = pathname + ".py"
        file_pyc = pathname + ".pyc"
        file_pyo = pathname + ".pyo"
        if (os.path.isfile(file_pyo) and
                os.stat(file_pyo).st_mtime >= os.stat(file_py).st_mtime):
            fname = file_pyo    # Use .pyo file
        else:
            fname = file_pyc
            if (not os.path.isfile(file_pyc) or
                    os.stat(file_pyc).st_mtime < os.stat(file_py).st_mtime):
                import py_compile
                if self.debug:
                    print("Compiling %s" % file_py)
                py_compile.compile(file_py, file_pyc)
        archivename = os.path.split(fname)[1]
        if basename:
            archivename = "%s/%s" % (basename, archivename)
        return (fname, archivename)