- Got rid of newmodule.c
[python/dscho.git] / Lib / zipfile.py
blob5c1c229a93f4cfa383da556dd5014bcf22e02efb
1 "Read and write ZIP files."
2 # Written by James C. Ahlstrom jim@interet.com
3 # All rights transferred to CNRI pursuant to the Python contribution agreement
5 import struct, os, time
6 import binascii
8 try:
9 import zlib # We may need its compression method
10 except ImportError:
11 zlib = None
13 __all__ = ["BadZipfile", "error", "ZIP_STORED", "ZIP_DEFLATED", "is_zipfile",
14 "ZipInfo", "ZipFile", "PyZipFile"]
class BadZipfile(Exception):
    """Raised by this module when an archive is malformed or corrupt."""


# Alias for the exception raised by this module.
error = BadZipfile
# Compression methods understood by this module.  Every other method
# defined by the ZIP format is unsupported.
ZIP_STORED = 0
ZIP_DEFLATED = 8

# Magic numbers and struct-module layouts of the three record kinds.

# End-of-archive record: 8 items, 22 bytes.
stringEndArchive = "PK\005\006"
structEndArchive = "<4s4H2lH"

# Central directory entry: 19 items, 46 bytes.
stringCentralDir = "PK\001\002"
structCentralDir = "<4s4B4H3l5H2l"

# Local file header: 12 items, 30 bytes.
stringFileHeader = "PK\003\004"
structFileHeader = "<4s2B4H3l2H"
# Positions of the fields in an unpacked central directory entry.
# The names are bound to consecutive integers in the order listed.
(
    _CD_SIGNATURE,                  #  0
    _CD_CREATE_VERSION,             #  1
    _CD_CREATE_SYSTEM,              #  2
    _CD_EXTRACT_VERSION,            #  3
    _CD_EXTRACT_SYSTEM,             #  4  is this meaningful?
    _CD_FLAG_BITS,                  #  5
    _CD_COMPRESS_TYPE,              #  6
    _CD_TIME,                       #  7
    _CD_DATE,                       #  8
    _CD_CRC,                        #  9
    _CD_COMPRESSED_SIZE,            # 10
    _CD_UNCOMPRESSED_SIZE,          # 11
    _CD_FILENAME_LENGTH,            # 12
    _CD_EXTRA_FIELD_LENGTH,         # 13
    _CD_COMMENT_LENGTH,             # 14
    _CD_DISK_NUMBER_START,          # 15
    _CD_INTERNAL_FILE_ATTRIBUTES,   # 16
    _CD_EXTERNAL_FILE_ATTRIBUTES,   # 17
    _CD_LOCAL_HEADER_OFFSET,        # 18
) = range(19)
# Positions of the fields in an unpacked local file header.
# The names are bound to consecutive integers in the order listed.
(
    _FH_SIGNATURE,                  #  0
    _FH_EXTRACT_VERSION,            #  1
    _FH_EXTRACT_SYSTEM,             #  2  is this meaningful?
    _FH_GENERAL_PURPOSE_FLAG_BITS,  #  3
    _FH_COMPRESSION_METHOD,         #  4
    _FH_LAST_MOD_TIME,              #  5
    _FH_LAST_MOD_DATE,              #  6
    _FH_CRC,                        #  7
    _FH_COMPRESSED_SIZE,            #  8
    _FH_UNCOMPRESSED_SIZE,          #  9
    _FH_FILENAME_LENGTH,            # 10
    _FH_EXTRA_FIELD_LENGTH,         # 11
) = range(12)
def is_zipfile(filename):
    """Quickly see if file is a ZIP file by checking the magic number.

    Only the last 22 bytes of the file are inspected, so this will not
    accept a ZIP archive with an ending comment.

    filename: path of the file to test.

    Returns True when the file ends with an end-of-archive record that
    carries a zero comment length, otherwise False.  A file that cannot
    be opened, or that is shorter than 22 bytes, yields False rather
    than an exception.
    """
    try:
        fpin = open(filename, "rb")
        try:
            fpin.seek(-22, 2)       # Seek to end-of-file record
            endrec = fpin.read()
        finally:
            # Release the handle even when seek()/read() fails.
            fpin.close()
    except IOError:
        return False
    # Magic number first, zero-length archive comment last.
    return endrec[0:4] == "PK\005\006" and endrec[-2:] == "\000\000"
class ZipInfo:
    """Metadata describing one member of a ZIP archive.

    __init__ assigns the standard defaults.  The remaining attributes
    are set by class ZipFile:

        header_offset   Byte offset to the file header
        file_offset     Byte offset to the start of the file data
        CRC             CRC-32 of the uncompressed file
        compress_size   Size of the compressed file
        file_size       Size of the uncompressed file
    """

    def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
        """Create a record for filename stamped with date_time.

        filename: name of the file in the archive; passed through
            _normpath() before it is stored.
        date_time: sequence of year, month, day, hour, min, sec.
        """
        # Name of the file in the archive
        self.filename = _normpath(filename)
        # year, month, day, hour, min, sec
        self.date_time = date_time

        # Standard values
        self.compress_type = ZIP_STORED     # Type of compression
        self.comment = ""                   # Comment for each file
        self.extra = ""                     # ZIP extra data
        self.create_system = 0              # System which created archive
        self.create_version = 20            # Version which created archive
        self.extract_version = 20           # Version needed to extract
        self.reserved = 0                   # Must be zero
        self.flag_bits = 0                  # ZIP flag bits
        self.volume = 0                     # Volume number of file header
        self.internal_attr = 0              # Internal attributes
        self.external_attr = 0              # External file attributes

    def FileHeader(self):
        """Return the per-file header as a string.

        The result is the packed 30-byte header followed by the file
        name and the extra data.
        """
        stamp = self.date_time
        dosdate = (stamp[0] - 1980) << 9 | stamp[1] << 5 | stamp[2]
        dostime = stamp[3] << 11 | stamp[4] << 5 | (stamp[5] // 2)
        if self.flag_bits & 0x08:
            # These are written after the file data, so the header
            # carries zeros in their place.
            trailer = (0, 0, 0)
        else:
            trailer = (self.CRC, self.compress_size, self.file_size)
        packed = struct.pack(structFileHeader, stringFileHeader,
                             self.extract_version, self.reserved,
                             self.flag_bits, self.compress_type,
                             dostime, dosdate,
                             trailer[0], trailer[1], trailer[2],
                             len(self.filename), len(self.extra))
        return packed + self.filename + self.extra
# The ZIP format specification requires forward slashes as the
# directory separator, so paths stored in generated ZIP files are
# translated on platforms whose native separator is something else.
if os.sep == "/":
    def _normpath(path):
        """Return path unchanged; the native separator is already "/"."""
        return path
else:
    def _normpath(path):
        """Return path with every native separator replaced by "/"."""
        return path.replace(os.sep, "/")
class ZipFile:
    """Class with methods to open, read, write, close, list zip files.

    z = ZipFile(file, mode="r", compression=ZIP_STORED)

    file: Either the path to the file, or a file-like object.
          If it is a path, the file will be opened and closed by ZipFile.
    mode: The mode can be either read "r", write "w" or append "a".
    compression: ZIP_STORED (no compression) or ZIP_DEFLATED (requires zlib).
    """

    fp = None                   # Set here since __del__ checks it

    def __init__(self, file, mode="r", compression=ZIP_STORED):
        """Open the ZIP file with mode read "r", write "w" or append "a".

        Only the first character of mode is significant.

        Raises RuntimeError for an unsupported compression method, for
        ZIP_DEFLATED when zlib is missing, and for a mode that does not
        start with "r", "w" or "a".  Raises BadZipfile when an existing
        archive cannot be parsed.
        """
        if compression == ZIP_STORED:
            pass
        elif compression == ZIP_DEFLATED:
            if not zlib:
                raise RuntimeError(
                    "Compression requires the (missing) zlib module")
        else:
            raise RuntimeError("That compression method is not supported")
        self.debug = 0          # Level of printing: 0 through 3
        self.NameToInfo = {}    # Find file info given name
        self.filelist = []      # List of ZipInfo instances for archive
        self.compression = compression  # Method of compression
        self.mode = key = mode[0]

        # Reject a bad mode before touching the file system; otherwise
        # the open() below fails with a KeyError first.
        if key not in ("r", "w", "a"):
            raise RuntimeError('Mode must be "r", "w" or "a"')

        # Check if we were passed a file-like object
        if isinstance(file, basestring):
            self._filePassed = 0
            self.filename = file
            modeDict = {'r': 'rb', 'w': 'wb', 'a': 'r+b'}
            # Index by the normalised key so that e.g. "rb" is accepted.
            self.fp = open(file, modeDict[key])
        else:
            self._filePassed = 1
            self.fp = file
            self.filename = getattr(file, 'name', None)

        if key == 'r':
            self._GetContents()
        elif key == 'a':
            fp = self.fp
            try:
                fp.seek(-22, 2)         # Seek to end-of-file record
            except IOError:
                # Too short to hold an end-of-archive record, so it
                # cannot be a zip file.
                endrec = ""
            else:
                endrec = fp.read()
            if endrec[0:4] == stringEndArchive and \
               endrec[-2:] == "\000\000":
                self._GetContents()     # file is a zip file
                # seek to start of directory and overwrite
                fp.seek(self.start_dir, 0)
            else:                       # file is not a zip file, just append
                fp.seek(0, 2)
        # key == 'w': nothing to read, writing starts at the current
        # position.

    def _GetContents(self):
        """Read the directory, making sure we close the file if the format
        is bad."""
        try:
            self._RealGetContents()
        except BadZipfile:
            if not self._filePassed:
                self.fp.close()
            self.fp = None
            raise

    def _RealGetContents(self):
        """Read in the table of contents for the ZIP file.

        Fills self.filelist, self.NameToInfo and self.start_dir.

        Raises BadZipfile when a magic number is wrong or the archive
        ends with a comment, and RuntimeError when the name in a local
        file header differs from the name in the central directory.
        """
        fp = self.fp
        fp.seek(-22, 2)                 # Start of end-of-archive record
        filesize = fp.tell() + 22       # Get file size
        endrec = fp.read(22)    # Archive must not end with a comment!
        if endrec[0:4] != stringEndArchive or endrec[-2:] != "\000\000":
            raise BadZipfile(
                "File is not a zip file, or ends with a comment")
        endrec = struct.unpack(structEndArchive, endrec)
        if self.debug > 1:
            print(endrec)
        size_cd = endrec[5]             # bytes in central directory
        offset_cd = endrec[6]           # offset of central directory
        inferred_cd = filesize - 22 - size_cd
        # "concat" is zero, unless zip was concatenated to another file
        concat = inferred_cd - offset_cd
        if self.debug > 2:
            print("given, inferred, offset %s %s %s"
                  % (offset_cd, inferred_cd, concat))
        # self.start_dir:  Position of start of central directory
        self.start_dir = offset_cd + concat
        fp.seek(self.start_dir, 0)
        total = 0
        while total < size_cd:
            centdir = fp.read(46)
            total = total + 46
            if centdir[0:4] != stringCentralDir:
                raise BadZipfile("Bad magic number for central directory")
            centdir = struct.unpack(structCentralDir, centdir)
            if self.debug > 2:
                print(centdir)
            filename = fp.read(centdir[_CD_FILENAME_LENGTH])
            # Create ZipInfo instance to store file information
            zinfo = ZipInfo(filename)
            zinfo.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
            zinfo.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
            total = (total + centdir[_CD_FILENAME_LENGTH]
                     + centdir[_CD_EXTRA_FIELD_LENGTH]
                     + centdir[_CD_COMMENT_LENGTH])
            zinfo.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] + concat
            # file_offset must be computed below...
            (zinfo.create_version, zinfo.create_system,
             zinfo.extract_version, zinfo.reserved,
             zinfo.flag_bits, zinfo.compress_type, t, d,
             zinfo.CRC, zinfo.compress_size,
             zinfo.file_size) = centdir[1:12]
            (zinfo.volume, zinfo.internal_attr,
             zinfo.external_attr) = centdir[15:18]
            # Convert date/time code to (year, month, day, hour, min, sec)
            zinfo.date_time = ((d >> 9) + 1980, (d >> 5) & 0xF, d & 0x1F,
                               t >> 11, (t >> 5) & 0x3F, (t & 0x1F) * 2)
            self.filelist.append(zinfo)
            self.NameToInfo[zinfo.filename] = zinfo
            if self.debug > 2:
                print("total %s" % total)
        for data in self.filelist:
            fp.seek(data.header_offset, 0)
            fheader = fp.read(30)
            if fheader[0:4] != stringFileHeader:
                raise BadZipfile("Bad magic number for file header")
            fheader = struct.unpack(structFileHeader, fheader)
            # file_offset is computed here, since the extra field for
            # the central directory and for the local file header
            # refer to different fields, and they can have different
            # lengths
            data.file_offset = (data.header_offset + 30
                                + fheader[_FH_FILENAME_LENGTH]
                                + fheader[_FH_EXTRA_FIELD_LENGTH])
            fname = fp.read(fheader[_FH_FILENAME_LENGTH])
            if fname != data.filename:
                raise RuntimeError(
                    'File name in directory "%s" and header "%s" differ.'
                    % (data.filename, fname))

    def namelist(self):
        """Return a list of file names in the archive."""
        return [data.filename for data in self.filelist]

    def infolist(self):
        """Return a list of class ZipInfo instances for files in the
        archive."""
        return self.filelist

    def printdir(self):
        """Print a table of contents for the zip file."""
        print("%-46s %19s %12s" % ("File Name", "Modified ", "Size"))
        for zinfo in self.filelist:
            date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time
            print("%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size))

    def testzip(self):
        """Read all the files and check the CRC.

        Returns the name of the first member that cannot be read, or
        None when every member reads cleanly.
        """
        for zinfo in self.filelist:
            try:
                self.read(zinfo.filename)       # Check CRC-32
            except (KeyboardInterrupt, SystemExit):
                # A request to stop is not a corrupt member.
                raise
            except Exception:
                return zinfo.filename

    def getinfo(self, name):
        """Return the instance of ZipInfo given 'name'.

        Raises KeyError when the archive has no member of that name.
        """
        return self.NameToInfo[name]

    def read(self, name):
        """Return file bytes (as a string) for name.

        The position of the underlying file is restored before
        returning.

        Raises RuntimeError when the archive is closed, was not opened
        in mode "r" or "a", or holds deflated data while zlib is
        missing.  Raises BadZipfile for an unsupported compression
        method or a CRC-32 mismatch.
        """
        if self.mode not in ("r", "a"):
            raise RuntimeError('read() requires mode "r" or "a"')
        if not self.fp:
            raise RuntimeError(
                "Attempt to read ZIP archive that was already closed")
        zinfo = self.getinfo(name)
        filepos = self.fp.tell()
        self.fp.seek(zinfo.file_offset, 0)
        data = self.fp.read(zinfo.compress_size)
        self.fp.seek(filepos, 0)
        if zinfo.compress_type == ZIP_STORED:
            pass
        elif zinfo.compress_type == ZIP_DEFLATED:
            if not zlib:
                raise RuntimeError(
                    "De-compression requires the (missing) zlib module")
            # zlib compress/decompress code by Jeremy Hylton of CNRI
            dc = zlib.decompressobj(-15)
            data = dc.decompress(data)
            # need to feed in unused pad byte so that zlib won't choke
            ex = dc.decompress('Z') + dc.flush()
            if ex:
                data = data + ex
        else:
            raise BadZipfile(
                "Unsupported compression method %d for file %s"
                % (zinfo.compress_type, name))
        crc = binascii.crc32(data)
        if crc != zinfo.CRC:
            raise BadZipfile("Bad CRC-32 for file %s" % name)
        return data

    def _writecheck(self, zinfo):
        """Check for errors before writing a file to the archive.

        A duplicate name only produces a warning, and only when
        self.debug is set.  Raises RuntimeError when the archive is
        closed or not writable, or when the compression method cannot
        be used.
        """
        if zinfo.filename in self.NameToInfo:
            if self.debug:      # Warning for duplicate names
                print("Duplicate name: %s" % zinfo.filename)
        if self.mode not in ("w", "a"):
            raise RuntimeError('write() requires mode "w" or "a"')
        if not self.fp:
            raise RuntimeError(
                "Attempt to write ZIP archive that was already closed")
        if zinfo.compress_type == ZIP_DEFLATED and not zlib:
            raise RuntimeError(
                "Compression requires the (missing) zlib module")
        if zinfo.compress_type not in (ZIP_STORED, ZIP_DEFLATED):
            raise RuntimeError("That compression method is not supported")

    def write(self, filename, arcname=None, compress_type=None):
        """Put the bytes from filename into the archive under the name
        arcname.

        filename: path of the file to add.
        arcname: name to store in the archive; defaults to filename.
        compress_type: compression method for this member; defaults to
            the method the archive was opened with.

        The header is first written with zero CRC and sizes, then
        patched in place once the data has been streamed, so the
        archive file must support seeking backwards.
        """
        st = os.stat(filename)
        mtime = time.localtime(st.st_mtime)
        date_time = mtime[0:6]
        # Create ZipInfo instance to store file information
        if arcname is None:
            zinfo = ZipInfo(filename, date_time)
        else:
            zinfo = ZipInfo(arcname, date_time)
        zinfo.external_attr = st[0] << 16       # Unix attributes
        if compress_type is None:
            zinfo.compress_type = self.compression
        else:
            zinfo.compress_type = compress_type
        self._writecheck(zinfo)
        fp = open(filename, "rb")
        try:
            zinfo.flag_bits = 0x00
            zinfo.header_offset = self.fp.tell()    # Start of header bytes
            # Must overwrite CRC and sizes with correct data later
            zinfo.CRC = CRC = 0
            zinfo.compress_size = compress_size = 0
            zinfo.file_size = file_size = 0
            self.fp.write(zinfo.FileHeader())
            zinfo.file_offset = self.fp.tell()      # Start of file bytes
            if zinfo.compress_type == ZIP_DEFLATED:
                cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
                                        zlib.DEFLATED, -15)
            else:
                cmpr = None
            while 1:
                buf = fp.read(1024 * 8)
                if not buf:
                    break
                file_size = file_size + len(buf)
                CRC = binascii.crc32(buf, CRC)
                if cmpr:
                    buf = cmpr.compress(buf)
                    compress_size = compress_size + len(buf)
                self.fp.write(buf)
        finally:
            # Do not leak the source file when reading or writing fails.
            fp.close()
        if cmpr:
            buf = cmpr.flush()
            compress_size = compress_size + len(buf)
            self.fp.write(buf)
            zinfo.compress_size = compress_size
        else:
            zinfo.compress_size = file_size
        zinfo.CRC = CRC
        zinfo.file_size = file_size
        # Seek backwards and write CRC and file sizes
        position = self.fp.tell()       # Preserve current position in file
        self.fp.seek(zinfo.header_offset + 14, 0)
        self.fp.write(struct.pack("<lll", zinfo.CRC, zinfo.compress_size,
                                  zinfo.file_size))
        self.fp.seek(position, 0)
        self.filelist.append(zinfo)
        self.NameToInfo[zinfo.filename] = zinfo

    def writestr(self, zinfo, bytes):
        """Write a file into the archive.  The contents is the string
        'bytes'.

        zinfo: ZipInfo instance describing the member; its size, CRC
            and offset attributes are filled in here.
        bytes: the uncompressed contents.
        """
        self._writecheck(zinfo)
        zinfo.file_size = len(bytes)            # Uncompressed size
        zinfo.CRC = binascii.crc32(bytes)       # CRC-32 checksum
        if zinfo.compress_type == ZIP_DEFLATED:
            co = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
                                  zlib.DEFLATED, -15)
            bytes = co.compress(bytes) + co.flush()
            zinfo.compress_size = len(bytes)    # Compressed size
        else:
            zinfo.compress_size = zinfo.file_size
        zinfo.header_offset = self.fp.tell()    # Start of header bytes
        self.fp.write(zinfo.FileHeader())
        zinfo.file_offset = self.fp.tell()      # Start of file bytes
        self.fp.write(bytes)
        if zinfo.flag_bits & 0x08:
            # Write CRC and file sizes after the file data
            self.fp.write(struct.pack("<lll", zinfo.CRC,
                                      zinfo.compress_size, zinfo.file_size))
        self.filelist.append(zinfo)
        self.NameToInfo[zinfo.filename] = zinfo

    def __del__(self):
        """Call the "close()" method in case the user forgot."""
        self.close()

    def close(self):
        """Close the file, and for mode "w" and "a" write the ending
        records.

        Calling close() on an already closed archive does nothing.  A
        file object passed in by the caller is flushed but left open.
        """
        if self.fp is None:
            return
        fp = self.fp
        try:
            if self.mode in ("w", "a"):         # write ending records
                pos1 = fp.tell()
                for zinfo in self.filelist:     # write central directory
                    dt = zinfo.date_time
                    dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
                    dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
                    centdir = struct.pack(structCentralDir,
                        stringCentralDir, zinfo.create_version,
                        zinfo.create_system, zinfo.extract_version,
                        zinfo.reserved, zinfo.flag_bits,
                        zinfo.compress_type, dostime, dosdate,
                        zinfo.CRC, zinfo.compress_size, zinfo.file_size,
                        len(zinfo.filename), len(zinfo.extra),
                        len(zinfo.comment),
                        0, zinfo.internal_attr, zinfo.external_attr,
                        zinfo.header_offset)
                    fp.write(centdir)
                    fp.write(zinfo.filename)
                    fp.write(zinfo.extra)
                    fp.write(zinfo.comment)
                pos2 = fp.tell()
                count = len(self.filelist)
                # Write end-of-zip-archive record
                endrec = struct.pack(structEndArchive, stringEndArchive,
                                     0, 0, count, count, pos2 - pos1,
                                     pos1, 0)
                fp.write(endrec)
                fp.flush()
        finally:
            # Drop the reference first so that a failure above cannot
            # make __del__ attempt the directory a second time.
            self.fp = None
            if not self._filePassed:
                fp.close()
class PyZipFile(ZipFile):
    """Class to create ZIP archives with Python library files and packages."""

    def writepy(self, pathname, basename = ""):
        """Add all files from "pathname" to the ZIP archive.

        If pathname is a package directory, search the directory and
        all package subdirectories recursively for all *.py and enter
        the modules into the archive.  If pathname is a plain
        directory, listdir *.py and enter all modules.  Else, pathname
        must be a Python *.py file and the module will be put into the
        archive.  Added modules are always module.pyo or module.pyc.
        This method will compile the module.py into module.pyc if
        necessary.

        basename: archive directory the modules are stored under; empty
            for the top level.

        Raises RuntimeError when pathname is not a directory and does
        not end with ".py".
        """
        if os.path.isdir(pathname):
            initname = os.path.join(pathname, "__init__.py")
            if os.path.isfile(initname):
                # This is a package directory, add it.  Normalise the
                # path first so that a trailing separator ("pkg/") does
                # not produce an empty package name.
                name = os.path.basename(os.path.normpath(pathname))
                if basename:
                    basename = "%s/%s" % (basename, name)
                else:
                    basename = name
                if self.debug:
                    print("Adding package in %s as %s"
                          % (pathname, basename))
                self._write_module(initname[0:-3], basename, "Adding")
                dirlist = os.listdir(pathname)
                dirlist.remove("__init__.py")
                # Add all *.py files and package subdirectories
                for filename in dirlist:
                    path = os.path.join(pathname, filename)
                    ext = os.path.splitext(filename)[1]
                    if os.path.isdir(path):
                        if os.path.isfile(os.path.join(path, "__init__.py")):
                            # This is a package directory, add it
                            self.writepy(path, basename)  # Recursive call
                    elif ext == ".py":
                        self._write_module(path[0:-3], basename, "Adding")
            else:
                # This is NOT a package directory, add its files at top level
                if self.debug:
                    print("Adding files from directory %s" % pathname)
                for filename in os.listdir(pathname):
                    path = os.path.join(pathname, filename)
                    ext = os.path.splitext(filename)[1]
                    if ext == ".py":
                        self._write_module(path[0:-3], basename, "Adding")
        else:
            if pathname[-3:] != ".py":
                raise RuntimeError(
                    'Files added with writepy() must end with ".py"')
            self._write_module(pathname[0:-3], basename, "Adding file")

    def _write_module(self, modpath, basename, label):
        """Archive the compiled form of the module at modpath.

        modpath: module source path without the ".py" suffix.
        basename: archive directory to store the module under.
        label: prefix of the message printed when self.debug is set.
        """
        fname, arcname = self._get_codename(modpath, basename)
        if self.debug:
            print("%s %s" % (label, arcname))
        self.write(fname, arcname)

    def _get_codename(self, pathname, basename):
        """Return (filename, archivename) for the path.

        Given a module name path, return the correct file path and
        archive name, compiling if necessary.  For example, given
        /python/lib/string, return (/python/lib/string.pyc, string.pyc).

        An existing .pyo file is used when it is at least as new as
        the source; otherwise the .pyc file is used, and it is
        (re)compiled first when it is missing or older than the source.
        """
        file_py  = pathname + ".py"
        file_pyc = pathname + ".pyc"
        file_pyo = pathname + ".pyo"
        if os.path.isfile(file_pyo) and \
           os.stat(file_pyo).st_mtime >= os.stat(file_py).st_mtime:
            fname = file_pyo    # Use .pyo file
        elif not os.path.isfile(file_pyc) or \
             os.stat(file_pyc).st_mtime < os.stat(file_py).st_mtime:
            import py_compile
            if self.debug:
                print("Compiling %s" % file_py)
            py_compile.compile(file_py, file_pyc)
            fname = file_pyc
        else:
            fname = file_pyc
        archivename = os.path.split(fname)[1]
        if basename:
            archivename = "%s/%s" % (basename, archivename)
        return (fname, archivename)