1 // This file is part of Deark.
2 // Copyright (C) 2016 Jason Summers
3 // See the file COPYING for terms of use.
7 #include <deark-config.h>
8 #include <deark-private.h>
9 DE_DECLARE_MODULE(de_module_iptc
);
11 typedef struct localctx_struct
{
12 // The coded character set defined in 1:90.
13 // This applies to records 2-6, and sometimes 8.
19 typedef void (*ds_handler_fn
)(deark
*c
, lctx
*d
, const struct ds_info
*dsi
,
26 // 0x1 = A field consisting entirely of text ("graphic characters",
27 // "alphabetic characters", "numeric characters", "spaces", etc.)
34 static void handle_text(deark
*c
, lctx
*d
, const struct ds_info
*dsi
,
36 static void handle_uint16(deark
*c
, lctx
*d
, const struct ds_info
*dsi
,
38 static void handle_1_90(deark
*c
, lctx
*d
, const struct ds_info
*dsi
,
40 static void handle_2_120(deark
*c
, lctx
*d
, const struct ds_info
*dsi
,
42 static void handle_2_125(deark
*c
, lctx
*d
, const struct ds_info
*dsi
,
// Table of known IPTC-IIM datasets. lookup_ds_info() scans it linearly for
// an entry whose recnum and dsnum both match.
// Each entry appears to be {record number, dataset number, flags, name,
// handler} -- the struct definition is not visible here, so confirm the
// field order against struct ds_info.
// Flag 0x0001 marks a field consisting entirely of text. A NULL handler means
// the dataset has no dedicated handler function.
45 static const struct ds_info ds_info_arr
[] = {
46 { 1, 0, 0, "Model Version", handle_uint16
},
47 { 1, 5, 0x0001, "Destination", NULL
},
48 { 1, 20, 0, "File Format", handle_uint16
},
49 { 1, 22, 0, "File Format Version", handle_uint16
},
50 { 1, 30, 0x0001, "Service Identifier", NULL
},
51 { 1, 40, 0x0001, "Envelope Number", NULL
},
52 { 1, 50, 0x0001, "Product I.D.", NULL
},
53 { 1, 60, 0x0001, "Envelope Priority", NULL
},
54 { 1, 70, 0x0001, "Date Sent", NULL
},
55 { 1, 80, 0x0001, "Time Sent", NULL
},
56 { 1, 90, 0, "Coded Character Set", handle_1_90
},
57 { 1, 100, 0x0001, "UNO", NULL
},
58 { 1, 120, 0, "ARM Identifier", handle_uint16
},
59 { 1, 122, 0, "ARM Version", handle_uint16
},
60 { 2, 0, 0, "Record Version", handle_uint16
},
61 { 2, 3, 0x0001, "Object Type Reference", NULL
},
62 { 2, 4, 0x0001, "Object Attribute Reference", NULL
},
63 { 2, 5, 0x0001, "Object Name", NULL
},
64 { 2, 7, 0x0001, "Edit Status", NULL
},
65 { 2, 8, 0x0001, "Editorial Update", NULL
},
66 { 2, 10, 0x0001, "Urgency", NULL
},
67 { 2, 12, 0x0001, "Subject Reference", NULL
},
68 { 2, 15, 0x0001, "Category", NULL
},
69 { 2, 20, 0x0001, "Supplemental Category", NULL
},
70 { 2, 22, 0x0001, "Fixture Identifier", NULL
},
71 { 2, 25, 0x0001, "Keywords", NULL
},
72 { 2, 26, 0x0001, "Content Location Code", NULL
},
73 { 2, 27, 0x0001, "Content Location Name", NULL
},
74 { 2, 30, 0x0001, "Release Date", NULL
},
75 { 2, 35, 0x0001, "Release Time", NULL
},
76 { 2, 37, 0x0001, "Expiration Date", NULL
},
77 { 2, 38, 0x0001, "Expiration Time", NULL
},
78 { 2, 40, 0x0001, "Special Instructions", NULL
},
79 { 2, 42, 0x0001, "Action Advised", NULL
},
80 { 2, 45, 0x0001, "Reference Service", NULL
},
81 { 2, 47, 0x0001, "Reference Date", NULL
},
82 { 2, 50, 0x0001, "Reference Number", NULL
},
83 { 2, 55, 0x0001, "Date Created", NULL
},
84 { 2, 60, 0x0001, "Time Created", NULL
},
85 { 2, 62, 0x0001, "Digital Creation Date", NULL
},
86 { 2, 63, 0x0001, "Digital Creation Time", NULL
},
87 { 2, 65, 0x0001, "Originating Program", NULL
},
88 { 2, 70, 0x0001, "Program Version", NULL
},
89 { 2, 75, 0x0001, "Object Cycle", NULL
},
90 { 2, 80, 0x0001, "By-line", NULL
},
91 { 2, 85, 0x0001, "By-line Title", NULL
},
92 { 2, 90, 0x0001, "City", NULL
},
93 { 2, 92, 0x0001, "Sub-location", NULL
},
94 { 2, 95, 0x0001, "Province/State", NULL
},
95 { 2, 100, 0x0001, "Country/Primary Location Code", NULL
},
96 { 2, 101, 0x0001, "Country/Primary Location Name", NULL
},
97 { 2, 103, 0x0001, "Original Transmission Reference", NULL
},
98 { 2, 105, 0x0001, "Headline", NULL
},
99 { 2, 110, 0x0001, "Credit", NULL
},
100 { 2, 115, 0x0001, "Source", NULL
},
101 { 2, 116, 0x0001, "Copyright Notice", NULL
},
102 { 2, 118, 0x0001, "Contact", NULL
},
103 { 2, 120, 0x0001, "Caption/Abstract", handle_2_120
},
104 { 2, 122, 0x0001, "Writer/Editor", NULL
},
105 { 2, 125, 0, "Rasterized Caption", handle_2_125
},
106 { 2, 130, 0x0001, "Image Type", NULL
},
107 { 2, 131, 0x0001, "Image Orientation", NULL
},
108 { 2, 135, 0x0001, "Language Identifier", NULL
},
109 { 2, 150, 0x0001, "Audio Type", NULL
},
110 { 2, 151, 0x0001, "Audio Sampling Rate", NULL
},
111 { 2, 152, 0x0001, "Audio Sampling Resolution", NULL
},
112 { 2, 153, 0x0001, "Audio Duration", NULL
},
113 { 2, 154, 0x0001, "Audio Outcue", NULL
},
114 { 2, 200, 0, "ObjectData Preview File Format", handle_uint16
},
115 { 2, 201, 0, "ObjectData Preview File Format Version", handle_uint16
},
116 { 2, 202, 0, "ObjectData Preview Data", NULL
},
119 { 7, 10, 0, "Size Mode", NULL
},
120 { 7, 20, 0, "Max Subfile Size", NULL
},
121 { 7, 90, 0, "ObjectData Size Announced", NULL
},
122 { 7, 95, 0, "Maximum ObjectData Size", NULL
},
123 { 8, 10, 0, "Subfile", NULL
},
124 { 9, 10, 0, "Confirmed ObjectData Size", NULL
}
// Returns the character encoding to assume for text in record `recnum`.
// Only records 2-6 get special treatment; the visible fallback is
// DE_ENCODING_UNKNOWN, which callers handle themselves (handle_text()
// substitutes ASCII, handle_2_120() copies the raw bytes).
// NOTE(review): the value returned for records 2-6 is not visible here;
// presumably it is d->charset, as set from dataset 1:90 -- confirm.
127 static de_encoding
get_ds_encoding(deark
*c
, lctx
*d
, u8 recnum
)
129 if(recnum
>=2 && recnum
<=6) {
132 return DE_ENCODING_UNKNOWN
;
// Handler for dataset 1:90 (Coded Character Set).
// Resets d->charset to DE_ENCODING_UNKNOWN, then recognizes exactly one
// value: a field beginning with the 3 bytes ESC % G ("\x1b\x25\x47"), which
// is treated as selecting UTF-8. Any other content leaves the charset unknown.
// The resulting charset name is written to the debug log.
135 static void handle_1_90(deark
*c
, lctx
*d
, const struct ds_info
*dsi
,
140 d
->charset
= DE_ENCODING_UNKNOWN
;
142 // TODO: Fully interpret this field.
// Only the first 3 bytes are compared, so longer fields with this prefix
// also match.
144 if(len
>=3 && !dbuf_memcmp(c
->infile
, pos
, "\x1b\x25\x47", 3)) {
145 d
->charset
= DE_ENCODING_UTF8
;
148 if(d
->charset
==DE_ENCODING_UTF8
)
153 de_dbg(c
, "charset: %s", csname
);
// Handler for dataset 2:120 (Caption/Abstract).
// Below extract level 2, the caption is passed to handle_text(), which prints
// it to the debug log (presumably nothing is extracted in that case -- the
// rest of that branch is not visible here).
// Otherwise the caption is written to an auxiliary output file using the
// "caption.txt" filename token: as raw bytes if the encoding is unknown, or
// converted to UTF-8 if it is known.
157 static void handle_2_120(deark
*c
, lctx
*d
, const struct ds_info
*dsi
,
160 de_ucstring
*s
= NULL
;
162 de_encoding encoding
;
165 if(c
->extract_level
<2) {
166 handle_text(c
, d
, dsi
, pos
, len
);
170 // Note that this code for extracting captions while decoding IPTC is not
171 // easily reached, because it requires the -a option, and -a usually causes
172 // IPTC to be extracted instead of decoded.
173 // It can be reached when reading a raw IPTC file, or when using
174 // "-a -opt extractiptc=0".
176 fntoken
= "caption.txt";
178 encoding
= get_ds_encoding(c
, d
, dsi
->recnum
);
179 if(encoding
==DE_ENCODING_UNKNOWN
) {
180 // If the encoding is unknown, copy the raw bytes.
181 dbuf_create_file_from_slice(c
->infile
, pos
, len
, fntoken
,
182 NULL
, DE_CREATEFLAG_IS_AUX
);
186 // If the encoding is known, convert to UTF-8.
187 s
= ucstring_create(c
);
188 dbuf_read_to_ucstring(c
->infile
, pos
, len
, s
, 0, encoding
);
189 outf
= dbuf_create_output_file(c
, fntoken
, NULL
, DE_CREATEFLAG_IS_AUX
);
// NOTE(review): the final argument (1) presumably requests a BOM -- confirm
// against the declaration of ucstring_write_as_utf8().
190 ucstring_write_as_utf8(c
, s
, outf
, 1);
// Cleanup is guarded, so it is safe on paths where outf or s was never
// created.
193 if(outf
) dbuf_close(outf
);
194 if(s
) ucstring_destroy(s
);
197 // Rasterized caption
// Handler for dataset 2:125. Decodes the field as a bilevel bitmap (zero bits
// are white), transposes it, and writes it as an auxiliary image using the
// "caption" filename token, with the flip-image flag set.
// Nothing is written if the field is shorter than rowspan*height1 bytes.
198 static void handle_2_125(deark
*c
, lctx
*d
, const struct ds_info
*dsi
,
201 de_bitmap
*img
= NULL
;
205 // I can't find any examples of this field, so this may not be correct.
206 // The format seems to be well-documented, though the pixels are in an
209 width1
= 128; // (Will be rotated to be 460x128)
// Reject fields too short to hold the whole image.
212 if(len
< rowspan
* height1
) goto done
;
// NOTE(review): the last argument (1) is presumably the number of bytes or
// samples per pixel -- confirm against de_bitmap_create().
213 img
= de_bitmap_create(c
, width1
, height1
, 1);
214 de_convert_image_bilevel(c
->infile
, pos
, rowspan
, img
, DE_CVTF_WHITEISZERO
);
215 de_bitmap_transpose(img
);
216 de_bitmap_write_to_file(img
, "caption", DE_CREATEFLAG_IS_AUX
|DE_CREATEFLAG_FLIP_IMAGE
);
218 de_bitmap_destroy(img
);
221 // Caller supplies dsi. This function will set its fields.
// Zeroes *dsi, then searches ds_info_arr for an entry matching both recnum
// and dsnum. On a match, the whole table entry is copied into *dsi.
// NOTE(review): the return statements are not visible here. do_dataset()
// stores the result in ds_known, so presumably nonzero means "found".
222 static int lookup_ds_info(u8 recnum
, u8 dsnum
, struct ds_info
*dsi
)
226 de_zeromem(dsi
, sizeof(struct ds_info
));
228 for(i
=0; i
<DE_ARRAYCOUNT(ds_info_arr
); i
++) {
229 if(ds_info_arr
[i
].recnum
==recnum
&& ds_info_arr
[i
].dsnum
==dsnum
) {
230 *dsi
= ds_info_arr
[i
]; // struct copy
// Presumably the not-found path: record the record number anyway, since
// do_dataset() tests dsi.recnum for unknown datasets.
236 dsi
->recnum
= recnum
;
// Reads the data-field length of a dataset from f, starting at pos.
// The first two bytes are read as a big-endian 16-bit value x:
//  - x < 32768: "Standard DataSet" format (presumably x is the length itself;
//    that branch is not visible here).
//  - otherwise: "Extended DataSet" format. (x - 32768) is the number of length
//    bytes that follow, and the length is assembled from them, most
//    significant byte first.
// In the extended case, *bytes_consumed is set to the size of the length
// field itself (2 plus the number of extension bytes).
// do_dataset() treats a zero return as failure.
242 static int read_dflen(deark
*c
, dbuf
*f
, i64 pos
,
243 i64
*dflen
, i64
*bytes_consumed
)
247 x
= dbuf_getu16be(f
, pos
);
248 if(x
<32768) { // "Standard DataSet" format
252 else { // "Extended DataSet" format
253 i64 length_of_length
;
256 length_of_length
= x
- 32768;
258 *bytes_consumed
= 2 + length_of_length
;
260 for(i
=0; i
<length_of_length
; i
++) {
261 *dflen
= ((*dflen
)<<8) | dbuf_getbyte(f
, pos
+2+i
);
263 // IPTC seems to support fields up to (2^262136)-1 bytes.
264 // We arbitrarily limit it to (2^48)-1.
265 if((*dflen
)>=0x1000000000000LL
) {
266 de_err(c
, "Bad or unsupported IPTC data field length");
// Generic handler for text datasets: decodes len bytes at pos and prints them
// to the debug log as `<dataset name>: "<value>"`.
// The encoding comes from get_ds_encoding() for the dataset's record; if that
// is unknown, ASCII is assumed.
275 static void handle_text(deark
*c
, lctx
*d
, const struct ds_info
*dsi
,
278 de_encoding encoding
;
279 de_ucstring
*s
= NULL
;
281 s
= ucstring_create(c
);
282 encoding
= get_ds_encoding(c
, d
, dsi
->recnum
);
283 if(encoding
==DE_ENCODING_UNKNOWN
)
284 encoding
= DE_ENCODING_ASCII
;
285 dbuf_read_to_ucstring(c
->infile
, pos
, len
, s
, 0, encoding
);
286 de_dbg(c
, "%s: \"%s\"", dsi
->dsname
, ucstring_getpsz_d(s
));
// Handler for datasets holding a 16-bit big-endian integer: reads the value
// at pos and prints it to the debug log as `<dataset name>: <value>`.
290 static void handle_uint16(deark
*c
, lctx
*d
, const struct ds_info
*dsi
,
295 x
= de_getu16be(pos
);
296 de_dbg(c
, "%s: %d", dsi
->dsname
, (int)x
);
// Processes one dataset starting at pos1.
// As visible here, a dataset is: a marker byte, a record number byte, a
// dataset number byte, a length field (see read_dflen()), then the data.
// ds_idx is the index of this dataset in the sequence; in the visible code it
// is used only to tell trailing padding apart from a bad first marker.
// *bytes_consumed is set to the distance from pos1 to the current position.
// de_run_iptc() stops processing when this function returns zero.
299 static int do_dataset(deark
*c
, lctx
*d
, i64 ds_idx
, i64 pos1
,
307 i64 dflen_bytes_consumed
;
// A zero marker byte after at least one good dataset is treated as padding
// rather than as a hard error.
315 if(b
==0x00 && ds_idx
>0) {
316 // Extraneous padding at the end of data?
317 if(pos
== c
->infile
->len
-1) {
318 de_dbg(c
, "[ignoring extra byte at end of IPTC-IIM data]");
321 de_warn(c
, "Expected %"I64_FMT
" bytes of IPTC-IIM data, only found %"I64_FMT
,
322 c
->infile
->len
, pos
);
326 de_err(c
, "Bad IPTC tag marker (0x%02x) at %d", (int)b
, (int)pos
);
332 recnum
= de_getbyte(pos
++);
333 dsnum
= de_getbyte(pos
++);
335 ds_known
= lookup_ds_info(recnum
, dsnum
, &dsi
);
337 if(!read_dflen(c
, c
->infile
, pos
, &dflen
, &dflen_bytes_consumed
)) goto done
;
// Advance past the length field, so pos now points at the data.
338 pos
+= dflen_bytes_consumed
;
340 de_dbg(c
, "IPTC dataset %d:%02d (%s) dpos=%" I64_FMT
" dlen=%" I64_FMT
"",
341 (int)recnum
, (int)dsnum
, dsi
.dsname
, pos
, dflen
);
// Dispatch order: dataset-specific handler, then generic text (flag 0x1),
// then best-effort text for unknown record-2 datasets.
347 dsi
.hfn(c
, d
, &dsi
, pos
, dflen
);
349 else if(dsi
.flags
&0x1) {
350 handle_text(c
, d
, &dsi
, pos
, dflen
);
352 else if(dsi
.recnum
==2 && !ds_known
) {
353 // Unknown record-2 datasets often contain readable text.
354 handle_text(c
, d
, &dsi
, pos
, dflen
);
358 de_dbg_indent(c
, -1);
361 *bytes_consumed
= pos
- pos1
;
// Module run function: allocates the local context with the charset initially
// unknown, then processes datasets one after another.
// Processing stops at the end of the input file, when do_dataset() fails, or
// when a dataset reports that it consumed no bytes.
367 static void de_run_iptc(deark
*c
, de_module_params
*mparams
)
374 d
= de_malloc(c
, sizeof(lctx
));
375 d
->charset
= DE_ENCODING_UNKNOWN
;
380 if(pos
>=c
->infile
->len
) break;
381 if(!do_dataset(c
, d
, ds_count
, pos
, &bytes_consumed
)) break;
// Guard against making no forward progress.
382 if(bytes_consumed
<=0) break;
384 pos
+= bytes_consumed
;
// Module identify function. Requires the first byte to be 0x1c, a plausible
// record number, and an "iptc" file extension; any failed check returns 0.
// NOTE(review): the value returned on success is not visible here.
390 static int de_identify_iptc(deark
*c
)
394 // First byte of each dataset is 0x1c.
395 if(de_getbyte(0)!=0x1c) return 0;
397 // Check the record number. Record numbers 1-9 are known.
// Note that the check below accepts 1-15, a wider range than the known 1-9.
399 if(b
<1 || b
>15) return 0;
401 // This is not meant to imply that .iptc is an official file extension for
402 // IPTC data. It's just that it's sometimes used by Deark when extracting
403 // IPTC data to a file.
404 if(!de_input_file_has_ext(c
, "iptc")) return 0;
// Module registration: fills in the module's description and its run and
// identify callbacks in *mi.
409 void de_module_iptc(deark
*c
, struct deark_module_info
*mi
)
412 mi
->desc
= "IPTC-IIM metadata";
413 mi
->run_fn
= de_run_iptc
;
414 mi
->identify_fn
= de_identify_iptc
;