Merge ".mailmap: Correct two contributor names"
[mediawiki.git] / includes / media / JpegMetadataExtractor.php
blobb4ba8df5a0b681e6dabcdeeb1dda986159aa5d25
1 <?php
2 /**
3 * Extraction of JPEG image metadata.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
20 * @file
21 * @ingroup Media
24 use MediaWiki\Libs\UnpackFailedException;
25 use Wikimedia\AtEase\AtEase;
26 use Wikimedia\XMPReader\Reader as XMPReader;
/**
 * Class for reading jpegs and extracting metadata.
 * see also BitmapMetadataHandler.
 *
 * Based somewhat on GIFMetadataExtractor.
 *
 * @ingroup Media
 */
class JpegMetadataExtractor {
	/**
	 * The max segment is a safety check. A JPEG file should never even remotely have
	 * that many segments. Your average file has about 10.
	 */
	private const MAX_JPEG_SEGMENTS = 200;

	/**
	 * Function to extract metadata segments of interest from jpeg files
	 * based on GIFMetadataExtractor.
	 *
	 * we can almost use getimagesize to do this
	 * but gis doesn't support having multiple app1 segments
	 * and those can't extract xmp on files containing both exif and xmp data
	 *
	 * The returned array always has the list-valued keys 'XMP_ext', 'COM' and
	 * 'PSIR'. Depending on what the file contains it may also have 'XMP'
	 * (string), 'byteOrder' ('BE' or 'LE') and 'SOF' (whatever
	 * StringUtils::unpack() yields for the format
	 * 'Cbits/nheight/nwidth/Ccomponents').
	 *
	 * Scanning stops at the first EOI or SOS marker. The file handle is
	 * closed on every exit path, including when an exception is thrown.
	 *
	 * @param string $filename Name of jpeg file
	 * @return array Array of interesting segments.
	 * @throws InvalidJpegException
	 */
	public static function segmentSplitter( $filename ) {
		$showXMP = XMPReader::isSupported();

		$segmentCount = 0;

		$segments = [
			'XMP_ext' => [],
			'COM' => [],
			'PSIR' => [],
		];

		if ( !$filename ) {
			throw new InvalidJpegException( "No filename specified for " . __METHOD__ );
		}
		if ( !file_exists( $filename ) || is_dir( $filename ) ) {
			throw new InvalidJpegException( "Invalid file $filename passed to " . __METHOD__ );
		}

		$fh = fopen( $filename, "rb" );

		if ( !$fh ) {
			throw new InvalidJpegException( "Could not open file $filename" );
		}

		// Everything from here on runs with an open handle, so guard it with
		// try/finally to make sure the handle is released however we leave.
		try {
			$buffer = fread( $fh, 2 );
			if ( $buffer !== "\xFF\xD8" ) {
				throw new InvalidJpegException( "Not a jpeg, no SOI" );
			}
			while ( !feof( $fh ) ) {
				$buffer = fread( $fh, 1 );
				$segmentCount++;
				if ( $segmentCount > self::MAX_JPEG_SEGMENTS ) {
					throw new InvalidJpegException( 'Too many jpeg segments. Aborting' );
				}
				while ( $buffer !== "\xFF" && !feof( $fh ) ) {
					// In theory JPEG files are not allowed to contain anything between the sections,
					// but in practice they sometimes do. It's customary to ignore the garbage data.
					$buffer = fread( $fh, 1 );
				}

				$buffer = fread( $fh, 1 );
				while ( $buffer === "\xFF" && !feof( $fh ) ) {
					// Skip through any 0xFF padding bytes.
					$buffer = fread( $fh, 1 );
				}
				if ( $buffer === "\xFE" ) {
					// COM section -- file comment
					// First see if valid utf-8,
					// if not try to convert it to windows-1252.
					$com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
					UtfNormal\Validator::quickIsNFCVerify( $com );
					// turns $com to valid utf-8.
					// thus if no change, it's utf-8, otherwise it's something else.
					if ( $com !== $oldCom ) {
						AtEase::suppressWarnings();
						$com = $oldCom = iconv( 'windows-1252', 'UTF-8//IGNORE', $oldCom );
						AtEase::restoreWarnings();
					}
					// Try it again, if it's still not a valid string, then probably
					// binary junk or some really weird encoding, so don't extract.
					UtfNormal\Validator::quickIsNFCVerify( $com );
					if ( $com === $oldCom ) {
						$segments["COM"][] = $oldCom;
					} else {
						wfDebug( __METHOD__ . " Ignoring JPEG comment as is garbage." );
					}
				} elseif ( $buffer === "\xE1" ) {
					// APP1 section (Exif, XMP, and XMP extended)
					// The XMP variants are only extracted if XMP is enabled.
					$temp = self::jpegExtractMarker( $fh );
					// check what type of app segment this is.
					if ( substr( $temp, 0, 29 ) === "http://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
						// use trim to remove trailing \0 chars
						$segments["XMP"] = trim( substr( $temp, 29 ) );
					} elseif ( substr( $temp, 0, 35 ) === "http://ns.adobe.com/xmp/extension/\x00" && $showXMP ) {
						// use trim to remove trailing \0 chars
						$segments["XMP_ext"][] = trim( substr( $temp, 35 ) );
					} elseif ( substr( $temp, 0, 29 ) === "XMP\x00://ns.adobe.com/xap/1.0/\x00" && $showXMP ) {
						// Some images (especially flickr images) seem to have this.
						// I really have no idea what the deal is with them, but
						// whatever...
						// use trim to remove trailing \0 chars
						$segments["XMP"] = trim( substr( $temp, 29 ) );
						wfDebug( __METHOD__ . ' Found XMP section with wrong app identifier '
							. "Using anyways." );
					} elseif ( substr( $temp, 0, 6 ) === "Exif\0\0" ) {
						// Just need to find out what the byte order is.
						// because php's exif plugin sucks...
						// This is a II for little Endian, MM for big. Not a unicode BOM.
						$byteOrderMarker = substr( $temp, 6, 2 );
						if ( $byteOrderMarker === 'MM' ) {
							$segments['byteOrder'] = 'BE';
						} elseif ( $byteOrderMarker === 'II' ) {
							$segments['byteOrder'] = 'LE';
						} else {
							wfDebug( __METHOD__ . " Invalid byte ordering?!" );
						}
					}
				} elseif ( $buffer === "\xED" ) {
					// APP13 - PSIR. IPTC and some photoshop stuff
					$temp = self::jpegExtractMarker( $fh );
					if ( substr( $temp, 0, 14 ) === "Photoshop 3.0\x00" ) {
						$segments["PSIR"][] = $temp;
					}
				} elseif ( $buffer === "\xD9" || $buffer === "\xDA" ) {
					// EOI - end of image or SOS - start of scan. either way we're past any interesting segments
					return $segments;
				} elseif ( in_array( $buffer, [
					"\xC0", "\xC1", "\xC2", "\xC3", "\xC5", "\xC6", "\xC7",
					"\xC9", "\xCA", "\xCB", "\xCD", "\xCE", "\xCF" ], true )
				) {
					// SOF0, SOF1, SOF2, ... (same list as getimagesize)
					$temp = self::jpegExtractMarker( $fh );
					try {
						$segments["SOF"] = StringUtils::unpack( 'Cbits/nheight/nwidth/Ccomponents', $temp );
					} catch ( UnpackFailedException $e ) {
						throw new InvalidJpegException( $e->getMessage() );
					}
				} else {
					// segment we don't care about, so skip
					try {
						$size = StringUtils::unpack( "nint", fread( $fh, 2 ), 2 );
					} catch ( UnpackFailedException $e ) {
						throw new InvalidJpegException( $e->getMessage() );
					}
					// The size field counts its own two bytes, so anything
					// smaller than 2 cannot be a valid segment length.
					if ( $size['int'] < 2 ) {
						throw new InvalidJpegException( "invalid marker size in jpeg" );
					}
					// Note it's possible to seek beyond end of file if truncated.
					// fseek doesn't report a failure in this case.
					fseek( $fh, $size['int'] - 2, SEEK_CUR );
				}
			}
			// shouldn't get here.
			throw new InvalidJpegException( "Reached end of jpeg file unexpectedly" );
		} finally {
			fclose( $fh );
		}
	}

	/**
	 * Helper function for segmentSplitter.
	 *
	 * Reads the two byte big-endian size field at the current position of
	 * the handle and then the segment payload that follows it. The size
	 * field includes its own two bytes, so the payload is size - 2 bytes.
	 *
	 * @param resource &$fh File handle for JPEG file
	 * @throws InvalidJpegException If the size field cannot be read, is
	 *  smaller than 2, or fewer payload bytes than announced are available.
	 * @return string Data content of segment.
	 */
	private static function jpegExtractMarker( &$fh ) {
		try {
			$size = StringUtils::unpack( "nint", fread( $fh, 2 ), 2 );
		} catch ( UnpackFailedException $e ) {
			throw new InvalidJpegException( $e->getMessage() );
		}
		if ( $size['int'] < 2 ) {
			throw new InvalidJpegException( "invalid marker size in jpeg" );
		}
		if ( $size['int'] === 2 ) {
			// fread( ..., 0 ) generates a warning
			return '';
		}
		$segment = fread( $fh, $size['int'] - 2 );
		if ( strlen( $segment ) !== $size['int'] - 2 ) {
			throw new InvalidJpegException( "Segment shorter than expected" );
		}

		return $segment;
	}

	/**
	 * This reads the photoshop image resource.
	 * Currently it only compares the iptc/iim hash
	 * with the stored hash, which is used to determine the precedence
	 * of the iptc data. In future it may extract some other info, like
	 * url of copyright license.
	 *
	 * This should generally be called by BitmapMetadataHandler::doApp13()
	 *
	 * @param string $app13 Photoshop psir app13 block from jpg.
	 * @throws InvalidPSIRException
	 * @return string If the iptc hash is good or not. One of 'iptc-no-hash',
	 *  'iptc-good-hash', 'iptc-bad-hash'.
	 */
	public static function doPSIR( $app13 ) {
		if ( !$app13 ) {
			throw new InvalidPSIRException( "No App13 segment given" );
		}
		// First compare hash with real thing
		// 0x404 contains IPTC, 0x425 has hash
		// This is used to determine if the iptc is newer than
		// the xmp data, as xmp programs update the hash,
		// where non-xmp programs don't.

		$offset = 14; // skip past PHOTOSHOP 3.0 identifier. should already be checked.
		$appLen = strlen( $app13 );
		$realHash = "";
		$recordedHash = "";

		// the +12 is the length of an empty item.
		while ( $offset + 12 <= $appLen ) {
			$valid = true;
			if ( substr( $app13, $offset, 4 ) !== '8BIM' ) {
				// it's supposed to be 8BIM
				// but apparently sometimes isn't esp. in
				// really old jpg's
				$valid = false;
			}
			$offset += 4;
			$id = substr( $app13, $offset, 2 );
			// id is a 2 byte id number which identifies
			// the piece of info this record contains.

			$offset += 2;

			// some record types can contain a name, which
			// is a pascal string 0-padded to be an even
			// number of bytes. Most times (and any time
			// we care) this is empty, making it two null bytes.

			$lenName = ord( substr( $app13, $offset, 1 ) ) + 1;
			// we never use the name so skip it. +1 for length byte
			if ( $lenName % 2 === 1 ) {
				$lenName++;
			} // pad to even.
			$offset += $lenName;

			// now length of data (unsigned long big endian)
			try {
				$lenData = StringUtils::unpack( 'Nlen', substr( $app13, $offset, 4 ), 4 );
			} catch ( UnpackFailedException $e ) {
				throw new InvalidPSIRException( $e->getMessage() );
			}
			// PHP can take issue with very large unsigned ints and make them negative.
			// Which should never ever happen, as this has to be inside a segment
			// which is limited to a 16 bit number.
			if ( $lenData['len'] < 0 ) {
				throw new InvalidPSIRException( "Too big PSIR (" . $lenData['len'] . ')' );
			}

			$offset += 4; // 4bytes length field;

			// this should not happen, but check.
			if ( $lenData['len'] + $offset > $appLen ) {
				throw new InvalidPSIRException( "PSIR data too long. (item length=" . $lenData['len']
					. "; offset=$offset; total length=$appLen)" );
			}

			// Records without the 8BIM signature are still stepped over
			// below, but their contents are not interpreted.
			if ( $valid ) {
				switch ( $id ) {
					case "\x04\x04":
						// IPTC block
						$realHash = md5( substr( $app13, $offset, $lenData['len'] ), true );
						break;
					case "\x04\x25":
						// Hash of the IPTC block as recorded in the file
						$recordedHash = substr( $app13, $offset, $lenData['len'] );
						break;
				}
			}

			// if odd, add 1 to length to account for
			// null pad byte.
			if ( $lenData['len'] % 2 === 1 ) {
				$lenData['len']++;
			}
			$offset += $lenData['len'];
		}

		if ( !$realHash || !$recordedHash ) {
			return 'iptc-no-hash';
		}
		if ( $realHash === $recordedHash ) {
			return 'iptc-good-hash';
		}
		/* if $realHash !== $recordedHash */
		return 'iptc-bad-hash';
	}
}