Non-word characters don't terminate tag names.
[mediawiki.git] / includes / media / PNGMetadataExtractor.php
blob34e5fa722ac40978ef04213bf30c946e7b8278e2
<?php
/**
 * PNG frame counter and metadata extractor.
 *
 * Slightly derived from GIFMetadataExtractor.php
 * Deliberately not using MWExceptions to avoid external dependencies, encouraging
 * redistribution.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Media
 */
/**
 * PNG frame counter and metadata extractor.
 *
 * Walks the chunk list of a PNG (or APNG) file and collects the animation
 * frame/loop counts, total animation duration, bit depth, colour type and a
 * whitelisted set of textual metadata.
 *
 * @ingroup Media
 */
class PNGMetadataExtractor {
	/** The 8-byte PNG file signature; (re)initialised by getMetadata(). */
	static $png_sig;

	/** Size in bytes of the CRC trailing every chunk; (re)initialised by getMetadata(). */
	static $CRC_size;

	/** Map of lower-cased PNG text keywords to output field names; (re)initialised by getMetadata(). */
	static $text_chunks;

	const VERSION = 1;
	const MAX_CHUNK_SIZE = 3145728; // 3 megabytes

	/**
	 * Extract frame information and metadata from a PNG file.
	 *
	 * @param $filename String Path of the file to inspect.
	 * @throws Exception if the file is missing, unreadable, not a PNG, or
	 *   contains a chunk that cannot be read or is malformed.
	 * @return Array with the keys 'frameCount', 'loopCount', 'duration',
	 *   'text', 'bitDepth' and 'colorType'.
	 */
	static function getMetadata( $filename ) {
		self::$png_sig = pack( "C8", 137, 80, 78, 71, 13, 10, 26, 10 );
		self::$CRC_size = 4;
		/* based on list at http://owl.phy.queensu.ca/~phil/exiftool/TagNames/PNG.html#TextualData
		 * and http://www.w3.org/TR/PNG/#11keywords
		 */
		self::$text_chunks = array(
			'xml:com.adobe.xmp' => 'xmp',
			# Artist is unofficial. Author is the recommended
			# keyword in the PNG spec. However some people output
			# Artist so support both.
			'artist' => 'Artist',
			'model' => 'Model',
			'make' => 'Make',
			'author' => 'Artist',
			'comment' => 'PNGFileComment',
			'description' => 'ImageDescription',
			'title' => 'ObjectName',
			'copyright' => 'Copyright',
			# Source as in original device used to make image
			# not as in who gave you the image
			'source' => 'Model',
			'software' => 'Software',
			'disclaimer' => 'Disclaimer',
			'warning' => 'ContentWarning',
			'url' => 'Identifier', # Not sure if this is best mapping. Maybe WebStatement.
			'label' => 'Label',
			'creation time' => 'DateTimeDigitized',
			/* Other potentially useful things - Document */
		);

		$frameCount = 0;
		$loopCount = 1;
		$text = array();
		$duration = 0.0;
		$bitDepth = 0;
		$colorType = 'unknown';

		if ( !$filename ) {
			throw new Exception( __METHOD__ . ": No file name specified" );
		} elseif ( !file_exists( $filename ) || is_dir( $filename ) ) {
			throw new Exception( __METHOD__ . ": File $filename does not exist" );
		}

		$fh = fopen( $filename, 'rb' );

		if ( !$fh ) {
			throw new Exception( __METHOD__ . ": Unable to open file $filename" );
		}

		// Everything that can throw while the handle is open lives inside
		// this try block so that the handle is always released.
		try {
			// Check for the PNG header
			$buf = fread( $fh, 8 );
			if ( $buf !== self::$png_sig ) {
				throw new Exception( __METHOD__ . ": Not a valid PNG file; header: $buf" );
			}

			// Read chunks
			while ( !feof( $fh ) ) {
				$buf = fread( $fh, 4 );
				if ( !$buf || strlen( $buf ) < 4 ) {
					throw new Exception( __METHOD__ . ": Read error" );
				}
				$chunk_size = unpack( "N", $buf );
				$chunk_size = $chunk_size[1];

				if ( $chunk_size < 0 ) {
					throw new Exception( __METHOD__ . ": Chunk size too big for unpack" );
				}

				$chunk_type = fread( $fh, 4 );
				if ( !$chunk_type || strlen( $chunk_type ) < 4 ) {
					throw new Exception( __METHOD__ . ": Read error" );
				}

				if ( $chunk_type == "IHDR" ) {
					$buf = self::read( $fh, $chunk_size );
					// The fields read below sit at byte offsets 8 and 9, and the
					// PNG spec fixes IHDR at 13 bytes; reject anything shorter
					// instead of silently reporting depth 0 / greyscale.
					if ( !$buf || strlen( $buf ) < $chunk_size || $chunk_size < 13 ) {
						throw new Exception( __METHOD__ . ": Read error" );
					}
					$bitDepth = ord( substr( $buf, 8, 1 ) );
					// Detect the color type in British English as per the spec
					// http://www.w3.org/TR/PNG/#11IHDR
					switch ( ord( substr( $buf, 9, 1 ) ) ) {
						case 0:
							$colorType = 'greyscale';
							break;
						case 2:
							$colorType = 'truecolour';
							break;
						case 3:
							$colorType = 'index-coloured';
							break;
						case 4:
							$colorType = 'greyscale-alpha';
							break;
						case 6:
							$colorType = 'truecolour-alpha';
							break;
						default:
							$colorType = 'unknown';
							break;
					}
				} elseif ( $chunk_type == "acTL" ) {
					// Go through self::read() like every other chunk so the
					// MAX_CHUNK_SIZE guard applies, and require the full 8
					// bytes that the two 32-bit fields below need.
					$buf = self::read( $fh, $chunk_size );
					if ( !$buf || strlen( $buf ) < $chunk_size || $chunk_size < 8 ) {
						throw new Exception( __METHOD__ . ": Read error" );
					}

					$actl = unpack( "Nframes/Nplays", $buf );
					$frameCount = $actl['frames'];
					$loopCount = $actl['plays'];
				} elseif ( $chunk_type == "fcTL" ) {
					$buf = self::read( $fh, $chunk_size );
					if ( !$buf || strlen( $buf ) < $chunk_size ) {
						throw new Exception( __METHOD__ . ": Read error" );
					}
					// The delay numerator/denominator start at byte offset 20.
					$buf = substr( $buf, 20 );
					if ( strlen( $buf ) < 4 ) {
						throw new Exception( __METHOD__ . ": Read error" );
					}

					$fctldur = unpack( "ndelay_num/ndelay_den", $buf );
					if ( $fctldur['delay_den'] == 0 ) {
						// A zero denominator is treated as 100.
						$fctldur['delay_den'] = 100;
					}
					if ( $fctldur['delay_num'] ) {
						$duration += $fctldur['delay_num'] / $fctldur['delay_den'];
					}
				} elseif ( $chunk_type == "iTXt" ) {
					// Extracts iTXt chunks, uncompressing if necessary.
					$buf = self::read( $fh, $chunk_size );
					$items = array();
					if ( preg_match(
						'/^([^\x00]{1,79})\x00(\x00|\x01)\x00([^\x00]*)(.)[^\x00]*\x00(.*)$/Ds',
						$buf, $items )
					) {
						/* $items[1] = text chunk name, $items[2] = compressed flag,
						 * $items[3] = lang code (or ""), $items[4]= compression type.
						 * $items[5] = content
						 *
						 * NOTE(review): the pattern already requires the byte after
						 * the compressed flag to be \x00 via a literal, and group 4
						 * is captured after the language code, so it looks like it
						 * holds the byte following the language code rather than
						 * the compression type - confirm before relying on it.
						 */

						// Theoretically should be case-sensitive, but in practise...
						$items[1] = strtolower( $items[1] );
						if ( !isset( self::$text_chunks[$items[1]] ) ) {
							// Only extract textual chunks on our list.
							fseek( $fh, self::$CRC_size, SEEK_CUR );
							continue;
						}

						$items[3] = strtolower( $items[3] );
						if ( $items[3] == '' ) {
							// if no lang specified use x-default like in xmp.
							$items[3] = 'x-default';
						}

						// if compressed
						if ( $items[2] == "\x01" ) {
							if ( function_exists( 'gzuncompress' ) && $items[4] === "\x00" ) {
								wfSuppressWarnings();
								$items[5] = gzuncompress( $items[5] );
								wfRestoreWarnings();

								if ( $items[5] === false ) {
									// decompression failed
									wfDebug( __METHOD__ . ' Error decompressing iTxt chunk - ' . $items[1] . "\n");
									fseek( $fh, self::$CRC_size, SEEK_CUR );
									continue;
								}

							} else {
								wfDebug( __METHOD__ . ' Skipping compressed png iTXt chunk due to lack of zlib,'
									. " or potentially invalid compression method\n" );
								fseek( $fh, self::$CRC_size, SEEK_CUR );
								continue;
							}
						}
						$finalKeyword = self::$text_chunks[$items[1]];
						$text[$finalKeyword][$items[3]] = $items[5];
						$text[$finalKeyword]['_type'] = 'lang';

					} else {
						// Error reading iTXt chunk
						throw new Exception( __METHOD__ . ": Read error on iTXt chunk" );
					}

				} elseif ( $chunk_type == 'tEXt' ) {
					$buf = self::read( $fh, $chunk_size );

					// In case there is no \x00 which will make explode fail.
					if ( strpos( $buf, "\x00" ) === false ) {
						throw new Exception( __METHOD__ . ": Read error on tEXt chunk" );
					}

					list( $keyword, $content ) = explode( "\x00", $buf, 2 );
					if ( $keyword === '' || $content === '' ) {
						throw new Exception( __METHOD__ . ": Read error on tEXt chunk" );
					}

					// Theoretically should be case-sensitive, but in practise...
					$keyword = strtolower( $keyword );
					if ( !isset( self::$text_chunks[ $keyword ] ) ) {
						// Don't recognize chunk, so skip.
						fseek( $fh, self::$CRC_size, SEEK_CUR );
						continue;
					}
					wfSuppressWarnings();
					$content = iconv( 'ISO-8859-1', 'UTF-8', $content );
					wfRestoreWarnings();

					if ( $content === false ) {
						throw new Exception( __METHOD__ . ": Read error (error with iconv)" );
					}

					$finalKeyword = self::$text_chunks[$keyword];
					$text[$finalKeyword]['x-default'] = $content;
					$text[$finalKeyword]['_type'] = 'lang';

				} elseif ( $chunk_type == 'zTXt' ) {
					if ( function_exists( 'gzuncompress' ) ) {
						$buf = self::read( $fh, $chunk_size );

						// In case there is no \x00 which will make explode fail.
						if ( strpos( $buf, "\x00" ) === false ) {
							throw new Exception( __METHOD__ . ": Read error on zTXt chunk" );
						}

						list( $keyword, $postKeyword ) = explode( "\x00", $buf, 2 );
						if ( $keyword === '' || $postKeyword === '' ) {
							throw new Exception( __METHOD__ . ": Read error on zTXt chunk" );
						}
						// Theoretically should be case-sensitive, but in practise...
						$keyword = strtolower( $keyword );

						if ( !isset( self::$text_chunks[ $keyword ] ) ) {
							// Don't recognize chunk, so skip.
							fseek( $fh, self::$CRC_size, SEEK_CUR );
							continue;
						}
						// First byte after the keyword terminator is the
						// compression method; the rest is the compressed text.
						$compression = substr( $postKeyword, 0, 1 );
						$content = substr( $postKeyword, 1 );
						if ( $compression !== "\x00" ) {
							wfDebug( __METHOD__ . " Unrecognized compression method in zTXt ($keyword). Skipping.\n" );
							fseek( $fh, self::$CRC_size, SEEK_CUR );
							continue;
						}

						wfSuppressWarnings();
						$content = gzuncompress( $content );
						wfRestoreWarnings();

						if ( $content === false ) {
							// decompression failed
							wfDebug( __METHOD__ . ' Error decompressing zTXt chunk - ' . $keyword . "\n");
							fseek( $fh, self::$CRC_size, SEEK_CUR );
							continue;
						}

						wfSuppressWarnings();
						$content = iconv( 'ISO-8859-1', 'UTF-8', $content );
						wfRestoreWarnings();

						if ( $content === false ) {
							throw new Exception( __METHOD__ . ": Read error (error with iconv)" );
						}

						$finalKeyword = self::$text_chunks[$keyword];
						$text[$finalKeyword]['x-default'] = $content;
						$text[$finalKeyword]['_type'] = 'lang';

					} else {
						wfDebug( __METHOD__ . " Cannot decompress zTXt chunk due to lack of zlib. Skipping.\n" );
						fseek( $fh, $chunk_size, SEEK_CUR );
					}
				} elseif ( $chunk_type == 'tIME' ) {
					// last mod timestamp.
					if ( $chunk_size !== 7 ) {
						throw new Exception( __METHOD__ . ": tIME wrong size" );
					}
					$buf = self::read( $fh, $chunk_size );
					if ( !$buf || strlen( $buf ) < $chunk_size ) {
						throw new Exception( __METHOD__ . ": Read error" );
					}

					// Note: spec says this should be UTC.
					$t = unpack( "ny/Cm/Cd/Ch/Cmin/Cs", $buf );
					$strTime = sprintf( "%04d%02d%02d%02d%02d%02d",
						$t['y'], $t['m'], $t['d'], $t['h'],
						$t['min'], $t['s'] );

					$exifTime = wfTimestamp( TS_EXIF, $strTime );

					if ( $exifTime ) {
						$text['DateTime'] = $exifTime;
					}

				} elseif ( $chunk_type == 'pHYs' ) {
					// how big pixels are (dots per meter).
					if ( $chunk_size !== 9 ) {
						throw new Exception( __METHOD__ . ": pHYs wrong size" );
					}

					$buf = self::read( $fh, $chunk_size );
					if ( !$buf || strlen( $buf ) < $chunk_size ) {
						throw new Exception( __METHOD__ . ": Read error" );
					}

					$dim = unpack( "Nwidth/Nheight/Cunit", $buf );
					if ( $dim['unit'] == 1 ) {
						// Need to check for negative because php
						// doesn't deal with super-large unsigned 32-bit ints well
						if ( $dim['width'] > 0 && $dim['height'] > 0 ) {
							// unit is meters
							// (as opposed to 0 = undefined )
							$text['XResolution'] = $dim['width']
								. '/100';
							$text['YResolution'] = $dim['height']
								. '/100';
							$text['ResolutionUnit'] = 3;
							// 3 = dots per cm (from Exif).
						}
					}

				} elseif ( $chunk_type == "IEND" ) {
					break;
				} else {
					// Unhandled chunk type: skip over its data.
					fseek( $fh, $chunk_size, SEEK_CUR );
				}
				// Skip the CRC that follows the chunk data.
				fseek( $fh, self::$CRC_size, SEEK_CUR );
			}
		} catch ( Exception $e ) {
			// Do not leak the handle when parsing fails; the caller still
			// sees the original exception.
			fclose( $fh );
			throw $e;
		}
		fclose( $fh );

		if ( $loopCount > 1 ) {
			$duration *= $loopCount;
		}

		if ( isset( $text['DateTimeDigitized'] ) ) {
			// Convert date format from rfc2822 to exif.
			foreach ( $text['DateTimeDigitized'] as $name => &$value ) {
				if ( $name === '_type' ) {
					continue;
				}

				// @todo FIXME: Currently timezones are ignored.
				// possibly should be wfTimestamp's
				// responsibility. (at least for numeric TZ)
				$formatted = wfTimestamp( TS_EXIF, $value );
				if ( $formatted ) {
					// Only change if we could convert the
					// date.
					// The png standard says it should be
					// in rfc2822 format, but not required.
					// In general for the exif stuff we
					// prettify the date if we can, but we
					// display as-is if we cannot or if
					// it is invalid.
					// So do the same here.

					$value = $formatted;
				}
			}
			// Break the reference left behind by the by-reference foreach.
			unset( $value );
		}
		return array(
			'frameCount' => $frameCount,
			'loopCount' => $loopCount,
			'duration' => $duration,
			'text' => $text,
			'bitDepth' => $bitDepth,
			'colorType' => $colorType,
		);
	}

	/**
	 * Read a chunk, checking to make sure its not too big.
	 *
	 * @param $fh resource The file handle
	 * @param $size Integer size in bytes.
	 * @throws Exception if too big.
	 * @return String The chunk. An empty string is returned for a
	 *   non-positive size; callers treat that like a failed read.
	 */
	private static function read( $fh, $size ) {
		if ( $size > self::MAX_CHUNK_SIZE ) {
			throw new Exception( __METHOD__ . ': Chunk size of ' . $size .
				' too big. Max size is: ' . self::MAX_CHUNK_SIZE );
		}
		if ( $size <= 0 ) {
			// fread() does not accept a zero length (warning on old PHP,
			// ValueError on PHP 8), so short-circuit empty chunks.
			return '';
		}
		return fread( $fh, $size );
	}
}