<?php
/**
 * ZIP file directories reader, for the purposes of upload verification.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */
/**
 * A class for reading ZIP file directories, for the purposes of upload
 * verification.
 *
 * Only a functional interface is provided: ZipDirectoryReader::read(). No
 * access is given to object instances.
 */
class ZipDirectoryReader {
	/**
	 * Read a ZIP file and call a function for each file discovered in it.
	 *
	 * Because this class is aimed at verification, an error is raised on
	 * suspicious or ambiguous input, instead of emulating some standard
	 * behavior.
	 *
	 * @param string $fileName The archive file name
	 * @param callable $callback The callback function. It will be called for each file
	 *   with a single associative array each time, with members:
	 *
	 *      - name: The file name. Directories conventionally have a trailing
	 *        slash.
	 *
	 *      - mtime: The file modification time, in MediaWiki 14-char format
	 *
	 *      - size: The uncompressed file size
	 *
	 * @param array $options An associative array of read options, with the option
	 *    name in the key. This may currently contain:
	 *
	 *      - zip64: If this is set to true, then we will emulate a
	 *        library with ZIP64 support, like OpenJDK 7. If it is set to
	 *        false, then we will emulate a library with no knowledge of
	 *        ZIP64.
	 *
	 *        NOTE: The ZIP64 code is untested and probably doesn't work. It
	 *        turned out to be easier to just reject ZIP64 archive uploads,
	 *        since they are likely to be very rare. Confirming safety of a
	 *        ZIP64 file is fairly complex. What do you do with a file that is
	 *        ambiguous and broken when read with a non-ZIP64 reader, but valid
	 *        when read with a ZIP64 reader? This situation is normal for a
	 *        valid ZIP64 file, and working out what non-ZIP64 readers will make
	 *        of such a file is not trivial.
	 *
	 * @return Status A Status object. The following fatal errors are defined:
	 *
	 *      - zip-file-open-error: The file could not be opened.
	 *
	 *      - zip-wrong-format: The file does not appear to be a ZIP file.
	 *
	 *      - zip-bad: There was something wrong or ambiguous about the file
	 *        data.
	 *
	 *      - zip-unsupported: The ZIP file uses features which
	 *        ZipDirectoryReader does not support.
	 *
	 * The default messages for those fatal errors are written in a way that
	 * makes sense for upload verification.
	 *
	 * If a fatal error is returned, more information about the error will be
	 * available in the debug log.
	 *
	 * Note that the callback function may be called any number of times before
	 * a fatal error is returned. If this occurs, the data sent to the callback
	 * function should be discarded.
	 */
	public static function read( $fileName, $callback, $options = [] ) {
		$zdr = new self( $fileName, $callback, $options );

		return $zdr->execute();
	}

	/** The file name */
	protected $fileName;

	/** The opened file resource */
	protected $file;

	/** The cached length of the file, or null if it has not been loaded yet. */
	protected $fileLength;

	/** A segmented cache of the file contents, keyed by segment index */
	protected $buffer = [];

	/** The file data callback */
	protected $callback;

	/** The ZIP64 mode */
	protected $zip64 = false;

	/** Stored headers */
	protected $eocdr, $eocdr64, $eocdr64Locator;

	/**
	 * Scratch array reset by execute(). Nothing in this class reads it.
	 * NOTE(review): presumably kept for subclasses; confirm before removing.
	 */
	protected $data;

	/** The "extra field" ID for ZIP64 central directory entries */
	const ZIP64_EXTRA_HEADER = 0x0001;

	/** The segment size for the file contents cache */
	const SEGSIZE = 16384;

	/** The index of the "general field" bit for UTF-8 file names */
	const GENERAL_UTF8 = 11;

	/** The index of the "general field" bit for central directory encryption */
	const GENERAL_CD_ENCRYPTED = 13;

	/**
	 * Private constructor
	 * @param string $fileName
	 * @param callable $callback
	 * @param array $options
	 */
	protected function __construct( $fileName, $callback, $options ) {
		$this->fileName = $fileName;
		$this->callback = $callback;

		if ( isset( $options['zip64'] ) ) {
			$this->zip64 = $options['zip64'];
		}
	}

	/**
	 * Read the directory according to settings in $this.
	 *
	 * @return Status
	 */
	public function execute() {
		// ZIP archives are binary data, so open with the 'b' flag to rule out
		// any newline translation on platforms that distinguish text mode.
		$this->file = fopen( $this->fileName, 'rb' );
		$this->data = [];
		if ( !$this->file ) {
			return Status::newFatal( 'zip-file-open-error' );
		}

		$status = Status::newGood();
		try {
			$this->readEndOfCentralDirectoryRecord();
			if ( $this->zip64 ) {
				list( $offset, $size ) = $this->findZip64CentralDirectory();
				$this->readCentralDirectory( $offset, $size );
			} else {
				// Saturated 16/32-bit fields are how a ZIP64 archive signals
				// that the real values live in the ZIP64 headers.
				if ( $this->eocdr['CD size'] == 0xffffffff
					|| $this->eocdr['CD offset'] == 0xffffffff
					|| $this->eocdr['CD entries total'] == 0xffff
				) {
					$this->error( 'zip-unsupported', 'Central directory header indicates ZIP64, ' .
						'but we are in legacy mode. Rejecting this upload is necessary to avoid ' .
						'opening vulnerabilities on clients using OpenJDK 7 or later.' );
				}

				list( $offset, $size ) = $this->findOldCentralDirectory();
				$this->readCentralDirectory( $offset, $size );
			}
		} catch ( ZipDirectoryReaderError $e ) {
			// Every error() call lands here and becomes a fatal Status.
			$status->fatal( $e->getErrorCode() );
		}

		fclose( $this->file );

		return $status;
	}

	/**
	 * Throw an error, and log a debug message
	 * @param string $code The error code, later reported through Status::fatal()
	 * @param string $debugMessage
	 * @throws ZipDirectoryReaderError
	 */
	public function error( $code, $debugMessage ) {
		wfDebug( __CLASS__ . ": Fatal error: $debugMessage\n" );
		throw new ZipDirectoryReaderError( $code );
	}

	/**
	 * Read the header which is at the end of the central directory,
	 * unimaginatively called the "end of central directory record" by the ZIP
	 * spec.
	 *
	 * On success $this->eocdr holds the unpacked fields plus the derived keys
	 * 'EOCDR size', 'file comment' and 'position'.
	 *
	 * @throws ZipDirectoryReaderError
	 */
	public function readEndOfCentralDirectoryRecord() {
		$info = [
			'signature' => 4,
			'disk' => 2,
			'CD start disk' => 2,
			'CD entries this disk' => 2,
			'CD entries total' => 2,
			'CD size' => 4,
			'CD offset' => 4,
			'file comment length' => 2,
		];
		$structSize = $this->getStructSize( $info );
		// The comment length field is 16 bits wide, so the record must start
		// within the last 65536 + $structSize bytes of the file.
		$startPos = $this->getFileLength() - 65536 - $structSize;
		if ( $startPos < 0 ) {
			$startPos = 0;
		}

		// Checked before getBlock(), which would otherwise report an empty
		// file as 'zip-bad' rather than 'zip-wrong-format'.
		if ( $this->getFileLength() === 0 ) {
			$this->error( 'zip-wrong-format', "The file is empty." );
		}

		$block = $this->getBlock( $startPos );
		$sigPos = strrpos( $block, "PK\x05\x06" );
		if ( $sigPos === false ) {
			$this->error( 'zip-wrong-format',
				"zip file lacks EOCDR signature. It probably isn't a zip file." );
		}

		$this->eocdr = $this->unpack( substr( $block, $sigPos ), $info );
		$this->eocdr['EOCDR size'] = $structSize + $this->eocdr['file comment length'];

		if ( $structSize + $this->eocdr['file comment length'] != strlen( $block ) - $sigPos ) {
			$this->error( 'zip-bad', 'trailing bytes after the end of the file comment' );
		}
		if ( $this->eocdr['disk'] !== 0
			|| $this->eocdr['CD start disk'] !== 0
		) {
			$this->error( 'zip-unsupported', 'more than one disk (in EOCDR)' );
		}
		$this->eocdr += $this->unpack(
			$block,
			[ 'file comment' => [ 'string', $this->eocdr['file comment length'] ] ],
			$sigPos + $structSize );
		$this->eocdr['position'] = $startPos + $sigPos;
	}

	/**
	 * Read the header called the "ZIP64 end of central directory locator". An
	 * error will be raised if it does not exist.
	 *
	 * @throws ZipDirectoryReaderError
	 */
	public function readZip64EndOfCentralDirectoryLocator() {
		$info = [
			'signature' => [ 'string', 4 ],
			'eocdr64 start disk' => 4,
			'eocdr64 offset' => 8,
			'number of disks' => 4,
		];
		$structSize = $this->getStructSize( $info );

		// The locator is expected to sit directly in front of the EOCDR.
		$start = $this->getFileLength() - $this->eocdr['EOCDR size'] - $structSize;
		$block = $this->getBlock( $start, $structSize );
		$this->eocdr64Locator = $data = $this->unpack( $block, $info );

		if ( $data['signature'] !== "PK\x06\x07" ) {
			// Note: Java will allow this and continue to read the
			// EOCDR64, so we have to reject the upload, we can't
			// just use the EOCDR header instead.
			$this->error( 'zip-bad', 'wrong signature on Zip64 end of central directory locator' );
		}
	}

	/**
	 * Read the header called the "ZIP64 end of central directory record". It
	 * may replace the regular "end of central directory record" in ZIP64 files.
	 *
	 * @throws ZipDirectoryReaderError
	 */
	public function readZip64EndOfCentralDirectoryRecord() {
		// NOTE(review): the ZIP specification describes the locator's last
		// field as the *total* number of disks, which would be 1 for an
		// ordinary single-file archive. Requiring 0 here looks too strict;
		// confirm before relying on ZIP64 mode.
		if ( $this->eocdr64Locator['eocdr64 start disk'] != 0
			|| $this->eocdr64Locator['number of disks'] != 0
		) {
			$this->error( 'zip-unsupported', 'more than one disk (in EOCDR64 locator)' );
		}

		$info = [
			'signature' => [ 'string', 4 ],
			'EOCDR64 size' => 8,
			'version made by' => 2,
			'version needed' => 2,
			'disk' => 4,
			'CD start disk' => 4,
			'CD entries this disk' => 8,
			'CD entries total' => 8,
			'CD size' => 8,
			'CD offset' => 8
		];
		$structSize = $this->getStructSize( $info );
		$block = $this->getBlock( $this->eocdr64Locator['eocdr64 offset'], $structSize );
		$this->eocdr64 = $data = $this->unpack( $block, $info );
		if ( $data['signature'] !== "PK\x06\x06" ) {
			$this->error( 'zip-bad', 'wrong signature on Zip64 end of central directory record' );
		}
		if ( $data['disk'] !== 0
			|| $data['CD start disk'] !== 0
		) {
			$this->error( 'zip-unsupported', 'more than one disk (in EOCDR64)' );
		}
	}

	/**
	 * Find the location of the central directory, as would be seen by a
	 * non-ZIP64 reader.
	 *
	 * @return array List containing the offset and the size, in that order.
	 * @throws ZipDirectoryReaderError
	 */
	public function findOldCentralDirectory() {
		$size = $this->eocdr['CD size'];
		$offset = $this->eocdr['CD offset'];
		$endPos = $this->eocdr['position'];

		// Some readers use the EOCDR position instead of the offset field
		// to find the directory, so to be safe, we check if they both agree.
		if ( $offset + $size != $endPos ) {
			$this->error( 'zip-bad', 'the central directory does not immediately precede the end ' .
				'of central directory record' );
		}

		return [ $offset, $size ];
	}

	/**
	 * Find the location of the central directory, as would be seen by a
	 * ZIP64-compliant reader.
	 *
	 * @return array List containing the offset and the size, in that order.
	 * @throws ZipDirectoryReaderError
	 */
	public function findZip64CentralDirectory() {
		// The spec is ambiguous about the exact rules of precedence between the
		// ZIP64 headers and the original headers. Here we follow zip_util.c
		// from OpenJDK 7.
		$size = $this->eocdr['CD size'];
		$offset = $this->eocdr['CD offset'];
		$numEntries = $this->eocdr['CD entries total'];
		$endPos = $this->eocdr['position'];
		if ( $size == 0xffffffff
			|| $offset == 0xffffffff
			|| $numEntries == 0xffff
		) {
			$this->readZip64EndOfCentralDirectoryLocator();

			if ( isset( $this->eocdr64Locator['eocdr64 offset'] ) ) {
				$this->readZip64EndOfCentralDirectoryRecord();
				if ( isset( $this->eocdr64['CD offset'] ) ) {
					$size = $this->eocdr64['CD size'];
					$offset = $this->eocdr64['CD offset'];
					$endPos = $this->eocdr64Locator['eocdr64 offset'];
				}
			}
		}
		// Some readers use the EOCDR position instead of the offset field
		// to find the directory, so to be safe, we check if they both agree.
		if ( $offset + $size != $endPos ) {
			$this->error( 'zip-bad', 'the central directory does not immediately precede the end ' .
				'of central directory record' );
		}

		return [ $offset, $size ];
	}

	/**
	 * Read the central directory at the given location, invoking the
	 * callback once per directory entry.
	 *
	 * @param int $offset Byte offset of the central directory in the file
	 * @param int $size Size of the central directory in bytes
	 * @throws ZipDirectoryReaderError
	 */
	public function readCentralDirectory( $offset, $size ) {
		$block = $this->getBlock( $offset, $size );

		$fixedInfo = [
			'signature' => [ 'string', 4 ],
			'version made by' => 2,
			'version needed' => 2,
			'general bits' => 2,
			'compression method' => 2,
			'mod time' => 2,
			'mod date' => 2,
			'CRC-32' => 4,
			'compressed size' => 4,
			'uncompressed size' => 4,
			'name length' => 2,
			'extra field length' => 2,
			'comment length' => 2,
			'disk number start' => 2,
			'internal attrs' => 2,
			'external attrs' => 4,
			'local header offset' => 4,
		];
		$fixedSize = $this->getStructSize( $fixedInfo );

		$pos = 0;
		while ( $pos < $size ) {
			$data = $this->unpack( $block, $fixedInfo, $pos );
			$pos += $fixedSize;

			if ( $data['signature'] !== "PK\x01\x02" ) {
				$this->error( 'zip-bad', 'Invalid signature found in directory entry' );
			}

			// The variable-length tail directly follows the fixed header;
			// its field lengths come from the header just unpacked.
			$variableInfo = [
				'name' => [ 'string', $data['name length'] ],
				'extra field' => [ 'string', $data['extra field length'] ],
				'comment' => [ 'string', $data['comment length'] ],
			];
			$data += $this->unpack( $block, $variableInfo, $pos );
			$pos += $this->getStructSize( $variableInfo );

			if ( $this->zip64 && (
					$data['compressed size'] == 0xffffffff
					|| $data['uncompressed size'] == 0xffffffff
					|| $data['local header offset'] == 0xffffffff )
			) {
				$zip64Data = $this->unpackZip64Extra( $data['extra field'] );
				if ( $zip64Data ) {
					// Array union keeps the left operand's values, so the
					// ZIP64 fields override the saturated 32-bit ones.
					$data = $zip64Data + $data;
				}
			}

			if ( $this->testBit( $data['general bits'], self::GENERAL_CD_ENCRYPTED ) ) {
				$this->error( 'zip-unsupported', 'central directory encryption is not supported' );
			}

			// Convert the timestamp into MediaWiki format
			// For the format, please see the MS-DOS 2.0 Programmer's Reference,
			// pages 3-5 and 3-6.
			$time = $data['mod time'];
			$date = $data['mod date'];

			$year = 1980 + ( $date >> 9 );
			$month = ( $date >> 5 ) & 15;
			$day = $date & 31;
			$hour = ( $time >> 11 ) & 31;
			$minute = ( $time >> 5 ) & 63;
			$second = ( $time & 31 ) * 2;
			$timestamp = sprintf( "%04d%02d%02d%02d%02d%02d",
				$year, $month, $day, $hour, $minute, $second );

			// Convert the character set in the file name
			if ( $this->testBit( $data['general bits'], self::GENERAL_UTF8 ) ) {
				$name = $data['name'];
			} else {
				$name = iconv( 'CP437', 'UTF-8', $data['name'] );
			}

			// Compile a data array for the user, with a sensible format
			$userData = [
				'name' => $name,
				'mtime' => $timestamp,
				'size' => $data['uncompressed size'],
			];
			call_user_func( $this->callback, $userData );
		}
	}

	/**
	 * Interpret ZIP64 "extra field" data and return an associative array.
	 *
	 * NOTE(review): all four ZIP64 fields are unpacked unconditionally. The
	 * ZIP specification appears to include each field only when the matching
	 * 32-bit field is saturated, so shorter records would make unpack() raise
	 * 'zip-bad'. Confirm before relying on ZIP64 mode.
	 *
	 * @param string $extraField The raw "extra field" bytes of one entry
	 * @return array|bool The unpacked ZIP64 fields, or false if the extra
	 *   field contains no ZIP64 record
	 * @throws ZipDirectoryReaderError
	 */
	public function unpackZip64Extra( $extraField ) {
		$extraHeaderInfo = [
			'id' => 2,
			'size' => 2,
		];
		$extraHeaderSize = $this->getStructSize( $extraHeaderInfo );

		$zip64ExtraInfo = [
			'uncompressed size' => 8,
			'compressed size' => 8,
			'local header offset' => 8,
			'disk number start' => 4,
		];

		// The extra field is a sequence of (id, size, data) records.
		$extraPos = 0;
		while ( $extraPos < strlen( $extraField ) ) {
			$extra = $this->unpack( $extraField, $extraHeaderInfo, $extraPos );
			$extraPos += $extraHeaderSize;
			$extra += $this->unpack( $extraField,
				[ 'data' => [ 'string', $extra['size'] ] ],
				$extraPos );
			$extraPos += $extra['size'];

			if ( $extra['id'] == self::ZIP64_EXTRA_HEADER ) {
				return $this->unpack( $extra['data'], $zip64ExtraInfo );
			}
		}

		return false;
	}

	/**
	 * Get the length of the file. The value is read with fstat() once and
	 * cached in $this->fileLength.
	 *
	 * @return int
	 */
	public function getFileLength() {
		if ( $this->fileLength === null ) {
			$stat = fstat( $this->file );
			$this->fileLength = $stat['size'];
		}

		return $this->fileLength;
	}

	/**
	 * Get the file contents from a given offset. If there are not enough bytes
	 * in the file to satisfy the request, an exception will be thrown.
	 *
	 * @param int $start The byte offset of the start of the block.
	 * @param int|null $length The number of bytes to return. If omitted, the remainder
	 *    of the file will be returned.
	 *
	 * @return string
	 * @throws ZipDirectoryReaderError
	 */
	public function getBlock( $start, $length = null ) {
		$fileLength = $this->getFileLength();
		if ( $start >= $fileLength ) {
			$this->error( 'zip-bad', "getBlock() requested position $start, " .
				"file length is $fileLength" );
		}
		if ( $length === null ) {
			$length = $fileLength - $start;
		}
		$end = $start + $length;
		if ( $end > $fileLength ) {
			$this->error( 'zip-bad', "getBlock() requested end position $end, " .
				"file length is $fileLength" );
		}
		// floor()/ceil() return floats; cast so the cache is indexed by ints.
		$startSeg = (int)floor( $start / self::SEGSIZE );
		$endSeg = (int)ceil( $end / self::SEGSIZE );

		// The inclusive upper bound can fetch one segment beyond the requested
		// range. getSegment() caches results and returns '' past the end of
		// the file, and the substr() below trims the surplus.
		$block = '';
		for ( $segIndex = $startSeg; $segIndex <= $endSeg; $segIndex++ ) {
			$block .= $this->getSegment( $segIndex );
		}

		$block = substr( $block,
			$start - $startSeg * self::SEGSIZE,
			$length );

		if ( strlen( $block ) < $length ) {
			$this->error( 'zip-bad', 'getBlock() returned an unexpectedly small amount of data' );
		}

		return $block;
	}

	/**
	 * Get a section of the file starting at position $segIndex * self::SEGSIZE,
	 * of length self::SEGSIZE. The result is cached. This is a helper function
	 * for getBlock().
	 *
	 * If there are not enough bytes in the file to satisfy the request, the
	 * return value will be truncated. If a request is made for a segment beyond
	 * the end of the file, an empty string will be returned.
	 *
	 * @param int $segIndex
	 *
	 * @return string
	 * @throws ZipDirectoryReaderError
	 */
	public function getSegment( $segIndex ) {
		if ( !isset( $this->buffer[$segIndex] ) ) {
			$bytePos = $segIndex * self::SEGSIZE;
			if ( $bytePos >= $this->getFileLength() ) {
				$this->buffer[$segIndex] = '';

				return '';
			}
			// fseek() returns 0 on success and -1 on failure.
			if ( fseek( $this->file, $bytePos ) ) {
				$this->error( 'zip-bad', "seek to $bytePos failed" );
			}
			$seg = fread( $this->file, self::SEGSIZE );
			if ( $seg === false ) {
				$this->error( 'zip-bad', "read from $bytePos failed" );
			}
			$this->buffer[$segIndex] = $seg;
		}

		return $this->buffer[$segIndex];
	}

	/**
	 * Get the size of a structure in bytes. See unpack() for the format of $struct.
	 * @param array $struct
	 * @return int
	 */
	public function getStructSize( $struct ) {
		$size = 0;
		foreach ( $struct as $type ) {
			if ( is_array( $type ) ) {
				// [ typeName, length ]: only the length contributes.
				list( , $fieldSize ) = $type;
				$size += $fieldSize;
			} else {
				$size += $type;
			}
		}

		return $size;
	}

	/**
	 * Unpack a binary structure. This is like the built-in unpack() function
	 * except nicer.
	 *
	 * @param string $string The binary data input
	 *
	 * @param array $struct An associative array giving structure members and their
	 *    types. In the key is the field name. The value may be either an
	 *    integer, in which case the field is a little-endian unsigned integer
	 *    encoded in the given number of bytes, or an array, in which case the
	 *    first element of the array is the type name, and the subsequent
	 *    elements are type-dependent parameters. Only one such type is defined:
	 *       - "string": The second array element gives the length of string.
	 *          Not null terminated.
	 *
	 * @param int $offset The offset into the string at which to start unpacking.
	 *
	 * @throws MWException If $struct names an unknown type
	 * @throws ZipDirectoryReaderError
	 * @return array Unpacked associative array. Note that large integers in the input
	 *    may be represented as floating point numbers in the return value, so
	 *    the use of weak comparison is advised.
	 */
	public function unpack( $string, $struct, $offset = 0 ) {
		$size = $this->getStructSize( $struct );
		if ( $offset + $size > strlen( $string ) ) {
			$this->error( 'zip-bad', 'unpack() would run past the end of the supplied string' );
		}

		$data = [];
		$pos = $offset;
		foreach ( $struct as $key => $type ) {
			if ( is_array( $type ) ) {
				list( $typeName, $fieldSize ) = $type;
				switch ( $typeName ) {
					case 'string':
						$data[$key] = substr( $string, $pos, $fieldSize );
						$pos += $fieldSize;
						break;
					default:
						throw new MWException( __METHOD__ . ": invalid type \"$typeName\"" );
				}
			} else {
				// Unsigned little-endian integer
				$length = intval( $type );

				// Calculate the value. Use an algorithm which automatically
				// upgrades the value to floating point if necessary.
				// Bytes are folded in from most to least significant.
				$value = 0;
				for ( $i = $length - 1; $i >= 0; $i-- ) {
					$value *= 256;
					$value += ord( $string[$pos + $i] );
				}

				// Throw an exception if there was loss of precision
				if ( $value > pow( 2, 52 ) ) {
					$this->error( 'zip-unsupported', 'number too large to be stored in a double. ' .
						'This could happen if we tried to unpack a 64-bit structure ' .
						'at an invalid location.' );
				}
				$data[$key] = $value;
				$pos += $length;
			}
		}

		return $data;
	}

	/**
	 * Returns a bit from a given position in an integer value, converted to
	 * boolean.
	 *
	 * @param int $value
	 * @param int $bitIndex The index of the bit, where 0 is the LSB.
	 * @return bool
	 */
	public function testBit( $value, $bitIndex ) {
		return (bool)( ( $value >> $bitIndex ) & 1 );
	}

	/**
	 * Debugging helper function which dumps a string in hexdump -C format.
	 * Output is written with print/printf; nothing is returned.
	 * @param string $s
	 */
	public function hexDump( $s ) {
		$n = strlen( $s );
		for ( $i = 0; $i < $n; $i += 16 ) {
			printf( "%08X ", $i );
			for ( $j = 0; $j < 16; $j++ ) {
				print " ";
				// Extra gap between the two groups of eight bytes.
				if ( $j == 8 ) {
					print " ";
				}
				if ( $i + $j >= $n ) {
					// Pad short final rows so the text column lines up.
					print "  ";
				} else {
					printf( "%02X", ord( $s[$i + $j] ) );
				}
			}

			print "  |";
			for ( $j = 0; $j < 16; $j++ ) {
				if ( $i + $j >= $n ) {
					print " ";
				} elseif ( ctype_print( $s[$i + $j] ) ) {
					print $s[$i + $j];
				} else {
					print '.';
				}
			}
			print "|\n";
		}
	}
}
/**
 * Internal exception class. Will be caught by private code.
 *
 * Carries the error code separately from the human-readable exception
 * message so that the catcher can retrieve it with getErrorCode().
 */
class ZipDirectoryReaderError extends Exception {
	/** The error code passed to the constructor */
	protected $errorCode;

	/**
	 * @param string $code The error code; it is also embedded in the
	 *   exception message.
	 */
	public function __construct( $code ) {
		$this->errorCode = $code;
		parent::__construct( "ZipDirectoryReader error: $code" );
	}

	/**
	 * @return mixed The error code given at construction time
	 */
	public function getErrorCode() {
		return $this->errorCode;
	}
}