Non-word characters don't terminate tag names.
[mediawiki.git] / includes / media / IPTC.php
blob544dd21152782837d81a8cb8ae005f1adab57e21
1 <?php
2 /**
3 * Class for some IPTC functions.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
20 * @file
21 * @ingroup Media
24 /**
25 * Class for some IPTC functions.
27 * @ingroup Media
29 class IPTC {
/**
 * This takes the results of iptcparse() and puts it into a
 * form that can be handled by mediawiki. Generally called from
 * BitmapMetadataHandler::doApp13.
 *
 * Tags that only need a charset conversion are driven by a lookup table;
 * date tags are paired with their time tag through a second table; the
 * remaining compound tags are handled case by case.
 *
 * @see http://www.iptc.org/std/IIM/4.1/specification/IIMV4.1.pdf
 *
 * @param string $rawData app13 block from jpeg containing iptc/iim data
 * @return Array iptc metadata array (empty if nothing could be parsed or
 *  the charset declared in tag 1:90 is not recognised)
 */
static function parse( $rawData ) {
	// IIM tag => metadata key, for tags whose value is only charset-converted.
	static $simpleTags = array(
		'2#120' => 'ImageDescription', // IPTC caption. mapped with exif ImageDescription
		'2#116' => 'Copyright', // copyright. Mapped with exif copyright
		'2#025' => 'Keywords',
		'2#101' => 'CountryDest', // Country (shown)
		'2#095' => 'ProvinceOrStateDest', // state/province (shown)
		'2#090' => 'CityDest', // city (shown)
		'2#092' => 'SublocationDest', // sublocation (shown)
		'2#005' => 'ObjectName', // object name/title
		'2#040' => 'SpecialInstructions',
		'2#105' => 'Headline',
		// "Identifies the provider of the objectdata,
		// not necessarily the owner/creator".
		'2#110' => 'Credit',
		// "Identifies the original owner of the intellectual content of the
		// objectdata. This could be an agency, a member of an agency or
		// an individual."
		'2#115' => 'Source',
		'2#007' => 'EditStatus', // edit status (lead, correction, etc)
		'2#015' => 'iimCategory', // category. deprecated. max 3 letters in theory, often more
		'2#020' => 'iimSupplementalCategory', // category. deprecated.
		'2#010' => 'Urgency', // urgency (1-8. 1 most, 5 normal, 8 low priority)
		// "Identifies objectdata that recurs often and predictably...
		// Example: Euroweather"
		'2#022' => 'FixtureIdentifier',
		// Content location code (iso 3166 + some custom things)
		// ex: TUR (for turkey), XUN (for UN), XSP (outer space)
		// See wikipedia article on iso 3166 and appendix D of iim std.
		'2#026' => 'LocationDestCode',
		// Content location name. Full printable name of location of photo.
		'2#027' => 'LocationDest',
		// Object cycle. a for morning (am), p for evening, b for both
		'2#075' => 'ObjectCycle',
		// Country/Primary location code.
		// "Indicates the code of the country/primary location where the
		// intellectual property of the objectdata was created"
		// unclear how this differs from 2#026
		'2#100' => 'CountryCodeDest',
		// original transmission ref.
		// "A code representing the location of original transmission
		// according to practises of the provider."
		'2#103' => 'OriginalTransmissionRef',
		'2#118' => 'Contact',
		// Writer/Editor
		// "Identification of the name of the person involved in the writing,
		// editing or correcting the objectdata or caption/abstract."
		'2#122' => 'Writer',
		'2#135' => 'LanguageCode', // lang code
	);

	// Date tag => array( matching time tag, metadata key ).
	// It doesn't accept incomplete dates even though they are valid
	// according to spec.
	// Should potentially store timezone as well.
	static $dateTags = array(
		// Date created (not date digitized). Maps to exif DateTimeOriginal
		'2#055' => array( '2#060', 'DateTimeOriginal' ),
		// Date converted to digital representation. Maps to exif DateTimeDigitized
		'2#062' => array( '2#063', 'DateTimeDigitized' ),
		// Date released.
		'2#030' => array( '2#035', 'DateTimeReleased' ),
		// Date expires.
		'2#037' => array( '2#038', 'DateTimeExpires' ),
	);

	$parsed = iptcparse( $rawData );
	$data = array();
	if ( !is_array( $parsed ) ) {
		return $data;
	}

	$c = '';
	//charset info contained in tag 1:90.
	if ( isset( $parsed['1#090'] ) && isset( $parsed['1#090'][0] ) ) {
		$c = self::getCharset( $parsed['1#090'][0] );
		if ( $c === false ) {
			//Unknown charset. refuse to parse.
			//note: There is a difference between
			//unknown and no charset specified.
			return array();
		}
		unset( $parsed['1#090'] );
	}

	foreach ( $parsed as $tag => $val ) {
		if ( isset( $val[0] ) && trim( $val[0] ) == '' ) {
			wfDebugLog( 'iptc', "IPTC tag $tag had only whitespace as its value." );
			continue;
		}

		if ( isset( $simpleTags[$tag] ) ) {
			$data[$simpleTags[$tag]] = self::convIPTC( $val, $c );
			continue;
		}

		if ( isset( $dateTags[$tag] ) ) {
			list( $timeTag, $field ) = $dateTags[$tag];
			$time = isset( $parsed[$timeTag] ) ? $parsed[$timeTag] : array();
			$timestamp = self::timeHelper( $val, $time, $c );
			if ( $timestamp ) {
				$data[$field] = $timestamp;
			}
			continue;
		}

		switch ( $tag ) {
			case '2#080': /* byline. Mapped with exif Artist */
				/* merge with byline title (2:85)
				 * like how exif does it with
				 * Title, person. Not sure if this is best
				 * approach since we no longer have the two fields
				 * separate. each byline title entry corresponds to a
				 * specific byline. */
				$bylines = self::convIPTC( $val, $c );
				if ( isset( $parsed['2#085'] ) ) {
					$titles = self::convIPTC( $parsed['2#085'], $c );
				} else {
					$titles = array();
				}

				for ( $i = 0; $i < count( $titles ); $i++ ) {
					if ( isset( $bylines[$i] ) ) {
						// theoretically this should always be set
						// but doesn't hurt to be careful.
						$bylines[$i] = $titles[$i] . ', ' . $bylines[$i];
					}
				}
				$data['Artist'] = $bylines;
				break;

			case '2#065':
				/* Originating Program.
				 * Combine with Program version (2:70) if present.
				 */
				$software = self::convIPTC( $val, $c );

				if ( count( $software ) !== 1 ) {
					//according to iim standard this cannot have multiple values
					//so if there is more than one, something weird is happening,
					//and we skip it.
					wfDebugLog( 'iptc', 'IPTC: Wrong count on 2:65 Software field' );
					break;
				}

				if ( isset( $parsed['2#070'] ) ) {
					//if a version is set for the software.
					$softwareVersion = self::convIPTC( $parsed['2#070'], $c );
					unset( $parsed['2#070'] );
					$data['Software'] = array( array( $software[0], $softwareVersion[0] ) );
				} else {
					$data['Software'] = $software;
				}
				break;

			case '2#000': /* iim version */
				// unlike other tags, this is a 2-byte binary number.
				//technically this is required if there is iptc data
				//but in practise it isn't always there.
				if ( strlen( $val[0] ) == 2 ) {
					//if is just to be paranoid.
					$versionValue = ord( substr( $val[0], 0, 1 ) ) * 256;
					$versionValue += ord( substr( $val[0], 1, 1 ) );
					$data['iimVersion'] = $versionValue;
				}
				break;

			case '2#004':
				// IntellectualGenre.
				// first 4 characters are an id code
				// That we're not really interested in.

				// This prop is weird, since it's
				// allowed to have multiple values
				// in iim 4.1, but not in the XMP
				// stuff. We're going to just
				// extract the first value.
				$con = self::convIPTC( $val, $c );
				if ( strlen( $con[0] ) < 5 ) {
					wfDebugLog( 'iptc', 'IPTC: 2:04 too short. Ignoring.' );
					break;
				}
				$data['IntellectualGenre'] = substr( $con[0], 4 );
				break;

			case '2#012':
				// Subject News code - this is a compound field
				// at the moment we only extract the subject news
				// code, which is an 8 digit (ascii) number
				// describing the subject matter of the content.
				$codes = self::convIPTC( $val, $c );
				foreach ( $codes as $ic ) {
					$fields = explode( ':', $ic, 3 );

					if ( count( $fields ) < 2 || $fields[0] !== 'IPTC' ) {
						// Same 'iptc' log group as every other message in this class.
						wfDebugLog( 'iptc', 'IPTC: Invalid 2:12 - ' . $ic );
						// Stops at the first malformed entry; this leaves
						// the inner foreach, not the switch.
						break;
					}
					$data['SubjectNewsCode'] = $fields[1];
				}
				break;

			// purposely does not do 2:125, 2:130, 2:131,
			// 2:47, 2:50, 2:45, 2:42, 2:8, 2:3
			// 2:200, 2:201, 2:202
			// or the audio stuff (2:150 to 2:154)

			case '2#070':
			case '2#060':
			case '2#063':
			case '2#085':
			case '2#038':
			case '2#035':
				//ignore. Handled elsewhere.
				break;

			default:
				wfDebugLog( 'iptc', "Unsupported iptc tag: $tag. Value: " . implode( ',', $val ) );
				break;
		}
	}

	return $data;
}
/**
 * Convert an iptc date and time tags into the exif format
 *
 * @todo Potentially this should also capture the timezone offset.
 * @param array $date The date tag
 * @param array $time The time tag
 * @param $c charset passed through to convIPTC()
 * @return String|null Date in exif format (date portion only when no
 *  usable time tag was supplied), or null if the input could not be converted.
 */
private static function timeHelper( $date, $time, $c ) {
	if ( count( $date ) !== 1 ) {
		//the standard says this should always be 1
		//just double checking.
		return null;
	}
	list( $date ) = self::convIPTC( $date, $c );

	if ( count( $time ) === 1 ) {
		list( $time ) = self::convIPTC( $time, $c );
		$dateOnly = false;
	} else {
		$time = '000000+0000'; //placeholder
		$dateOnly = true;
	}

	// The patterns are anchored because everything below reads the values
	// with fixed substr() offsets counted from position 0. The time pattern
	// is only anchored at the start so trailing bytes after the offset are
	// still tolerated, as before.
	if ( !( preg_match( '/^\d{6}[-+]\d{4}/', $time )
		&& preg_match( '/^\d{8}\z/', $date )
		&& substr( $date, 0, 4 ) !== '0000'
		&& substr( $date, 4, 2 ) !== '00'
		&& substr( $date, 6, 2 ) !== '00'
	) ) {
		//something wrong.
		// Note, this rejects some valid dates according to iptc spec
		// for example: the date 00000400 means the photo was taken in
		// April, but the year and day is unknown. We don't process these
		// types of incomplete dates atm.
		wfDebugLog( 'iptc', "IPTC: invalid time ( $time ) or date ( $date )" );
		return null;
	}

	$unixTS = wfTimestamp( TS_UNIX, $date . substr( $time, 0, 6 ) );
	if ( $unixTS === false ) {
		wfDebugLog( 'iptc', "IPTC: can't convert date to TS_UNIX: $date $time." );
		return null;
	}

	// Offset in seconds from the +HHMM / -HHMM suffix of the time tag.
	$tz = ( intval( substr( $time, 7, 2 ) ) * 60 * 60 )
		+ ( intval( substr( $time, 9, 2 ) ) * 60 );

	if ( substr( $time, 6, 1 ) === '-' ) {
		$tz = - $tz;
	}

	// NOTE(review): the offset is added to the timestamp here; confirm this
	// is the intended direction for converting the tagged time.
	$finalTimestamp = wfTimestamp( TS_EXIF, $unixTS + $tz );
	if ( $finalTimestamp === false ) {
		wfDebugLog( 'iptc', "IPTC: can't make final timestamp. Date: " . ( $unixTS + $tz ) );
		return null;
	}

	if ( $dateOnly ) {
		//return the date only
		return substr( $finalTimestamp, 0, 10 );
	}

	return $finalTimestamp;
}
/**
 * Helper function to convert charset for iptc values.
 *
 * A string is converted directly; for an array every element is
 * converted and the keys are kept.
 *
 * @param string|array $data The iptc string
 * @param string $charset The charset
 *
 * @return string|array
 */
private static function convIPTC( $data, $charset ) {
	if ( !is_array( $data ) ) {
		return self::convIPTCHelper( $data, $charset );
	}

	// Assign by key rather than iterating by reference, so no reference
	// variable is left bound to the last element once the loop ends.
	foreach ( $data as $key => $item ) {
		$data[$key] = self::convIPTCHelper( $item, $charset );
	}

	return $data;
}
/**
 * Helper function of a helper function to convert charset for iptc values.
 *
 * With a charset, the value is run through iconv() to UTF-8 and trimmed.
 * Without one, the value is returned untouched if the UTF-8 validator
 * leaves it unchanged; otherwise it is converted as Windows-1252.
 *
 * @param string $data The iptc string
 * @param string $charset The charset ('' when none was specified)
 *
 * @return string
 */
private static function convIPTCHelper( $data, $charset ) {
	if ( !$charset ) {
		//treat as utf-8 if is valid utf-8. otherwise pretend its windows-1252
		// most of the time if there is no 1:90 tag, it is either ascii, latin1, or utf-8
		$verified = $data;
		// presumably rewrites its argument in place into valid utf-8
		UtfNormal::quickIsNFCVerify( $verified );
		if ( $verified === $data ) {
			// validation didn't change anything, so keep the value as is
			return $data;
		}

		return self::convIPTCHelper( $data, 'Windows-1252' );
	}

	wfSuppressWarnings();
	$converted = iconv( $charset, "UTF-8//IGNORE", $data );
	wfRestoreWarnings();

	if ( $converted === false ) {
		wfDebugLog( 'iptc', __METHOD__ . " Error converting iptc data charset $charset to utf-8" );
		$converted = "";
	}

	return trim( $converted );
}
/**
 * Take the value of the 1:90 tag and return a charset name.
 *
 * Warning, this function does not (and is not intended to) detect
 * all iso 2022 escape codes. In practise, the code for utf-8 is the
 * only code that seems to have wide use. It does detect that code.
 *
 * @param string $tag 1:90 tag.
 * @return string|bool charset name, or false if the escape sequence
 *  is not one of the recognised ones
 */
static function getCharset( $tag ) {
	//According to iim standard, charset is defined by the tag 1:90.
	//in which there are iso 2022 escape sequences to specify the character set.
	//the iim standard seems to encourage that all necessary escape sequences are
	//in the 1:90 tag, but says it doesn't have to be.

	//This is in need of more testing probably. This is definitely not complete.
	//however reading the docs of some other iptc software, it appears that most iptc software
	//only recognizes utf-8. If 1:90 tag is not present content is
	// usually ascii or iso-8859-1 (and sometimes utf-8), but no guarantee.

	//This also won't work if there are more than one escape sequence in the 1:90 tag
	//or if something is put in the G2, or G3 charsets, etc. It will only reliably recognize utf-8.

	// This is just going through the charsets mentioned in appendix C of the iim standard.

	// \x1b = ESC.
	static $charsets = array(
		"\x1b%G" => 'UTF-8', // utf-8
		//Also call things that are compatible with utf-8, utf-8 (e.g. ascii)
		"\x1b(B" => 'UTF-8', // ascii
		"\x1b(@" => 'UTF-8', // iso-646-IRV (ascii in latest version, $ different in older version)
		"\x1b(A" => 'ISO646-GB', // like ascii, but british.
		"\x1b(C" => 'ISO-IR-8-1', // some obscure swedish/finland encoding
		"\x1b(D" => 'ISO-IR-8-2',
		"\x1b(E" => 'ISO-IR-9-1', // some obscure danish/norway encoding
		"\x1b(F" => 'ISO-IR-9-2',
		"\x1b(G" => 'SEN_850200_B', // aka iso 646-SE; ascii-like
		"\x1b(I" => 'ISO646-IT',
		"\x1b(L" => 'ISO646-PT',
		"\x1b(Z" => 'ISO646-ES',
		"\x1b([" => 'GREEK7-OLD',
		"\x1b(K" => 'ISO646-DE',
		"\x1b(N" => 'ISO_5427', // cyrillic
		"\x1b(`" => 'NS_4551-1', // iso646-NO
		"\x1b(f" => 'NF_Z_62-010', // iso646-FR
		"\x1b(g" => 'PT2', // iso646-PT2
		"\x1b(h" => 'ES2',
		"\x1b(i" => 'MSZ_7795.3', // iso646-HU
		"\x1b(w" => 'CSA_Z243.4-1985-1',
		"\x1b(x" => 'CSA_Z243.4-1985-2',
		"\x1b\$(B" => 'JIS_C6226-1983',
		"\x1b\$B" => 'JIS_C6226-1983',
		"\x1b&@\x1b\$B" => 'JIS_C6226-1983',
		"\x1b&@\x1b\$(B" => 'JIS_C6226-1983',
		// iso-8859-x. at least for the high code characters.
		"\x1b-A" => 'ISO-8859-1',
		"\x1b(@\x1b-A" => 'ISO-8859-1',
		"\x1b(B\x1b-A" => 'ISO-8859-1',
		"\x1b-B" => 'ISO-8859-2',
		"\x1b-C" => 'ISO-8859-3',
		"\x1b-D" => 'ISO-8859-4',
		"\x1b-E" => 'ISO-8859-5',
		"\x1b-F" => 'ISO-8859-6',
		"\x1b-G" => 'ISO-8859-7',
		"\x1b-H" => 'ISO-8859-8',
		// CSN_369103. at least for the high code characters.
		"\x1b-I" => 'CSN_369103',
	);

	// Exact string match only: anything else is reported as unknown.
	if ( is_string( $tag ) && isset( $charsets[$tag] ) ) {
		return $charsets[$tag];
	}

	wfDebugLog( 'iptc', __METHOD__ . ' Unknown charset in iptc 1:90: ' . bin2hex( $tag ) );
	//at this point just give up and refuse to parse iptc?
	return false;
}