Refactoring the iff decoder
[deark.git] / modules / iptc.c
blob097d89874449167aeaeeac8363edb534d74eed56
1 // This file is part of Deark.
2 // Copyright (C) 2016 Jason Summers
3 // See the file COPYING for terms of use.
5 // IPTC-IIM metadata
7 #include <deark-config.h>
8 #include <deark-private.h>
9 DE_DECLARE_MODULE(de_module_iptc);
11 typedef struct localctx_struct {
12 // The coded character set defined in 1:90.
13 // This applied to records 2-6, and sometimes 8.
14 de_encoding charset;
15 } lctx;
17 struct ds_info;
19 typedef void (*ds_handler_fn)(deark *c, lctx *d, const struct ds_info *dsi,
20 i64 pos, i64 len);
22 struct ds_info {
23 u8 recnum;
24 u8 dsnum;
26 // 0x1 = A field consisting entirely of text ("graphic characters",
27 // "alphabetic characters", "numeric characters, "spaces", etc.)
28 u32 flags;
30 const char *dsname;
31 ds_handler_fn hfn;
34 static void handle_text(deark *c, lctx *d, const struct ds_info *dsi,
35 i64 pos, i64 len);
36 static void handle_uint16(deark *c, lctx *d, const struct ds_info *dsi,
37 i64 pos, i64 len);
38 static void handle_1_90(deark *c, lctx *d, const struct ds_info *dsi,
39 i64 pos, i64 len);
40 static void handle_2_120(deark *c, lctx *d, const struct ds_info *dsi,
41 i64 pos, i64 len);
42 static void handle_2_125(deark *c, lctx *d, const struct ds_info *dsi,
43 i64 pos, i64 len);
45 static const struct ds_info ds_info_arr[] = {
46 { 1, 0, 0, "Model Version", handle_uint16 },
47 { 1, 5, 0x0001, "Destination", NULL },
48 { 1, 20, 0, "File Format", handle_uint16 },
49 { 1, 22, 0, "File Format Version", handle_uint16 },
50 { 1, 30, 0x0001, "Service Identifier", NULL },
51 { 1, 40, 0x0001, "Envelope Number", NULL },
52 { 1, 50, 0x0001, "Product I.D.", NULL },
53 { 1, 60, 0x0001, "Envelope Priority", NULL },
54 { 1, 70, 0x0001, "Date Sent", NULL },
55 { 1, 80, 0x0001, "Time Sent", NULL },
56 { 1, 90, 0, "Coded Character Set", handle_1_90 },
57 { 1, 100, 0x0001, "UNO", NULL },
58 { 1, 120, 0, "ARM Identifier", handle_uint16 },
59 { 1, 122, 0, "ARM Version", handle_uint16 },
60 { 2, 0, 0, "Record Version", handle_uint16 },
61 { 2, 3, 0x0001, "Object Type Reference", NULL },
62 { 2, 4, 0x0001, "Object Attribute Reference", NULL },
63 { 2, 5, 0x0001, "Object Name", NULL },
64 { 2, 7, 0x0001, "Edit Status", NULL },
65 { 2, 8, 0x0001, "Editorial Update", NULL },
66 { 2, 10, 0x0001, "Urgency", NULL },
67 { 2, 12, 0x0001, "Subject Reference", NULL },
68 { 2, 15, 0x0001, "Category", NULL },
69 { 2, 20, 0x0001, "Supplemental Category", NULL },
70 { 2, 22, 0x0001, "Fixture Identifier", NULL },
71 { 2, 25, 0x0001, "Keywords", NULL },
72 { 2, 26, 0x0001, "Content Location Code", NULL },
73 { 2, 27, 0x0001, "Content Location Name", NULL },
74 { 2, 30, 0x0001, "Release Date", NULL },
75 { 2, 35, 0x0001, "Release Time", NULL },
76 { 2, 37, 0x0001, "Expiration Date", NULL },
77 { 2, 38, 0x0001, "Expiration Time", NULL },
78 { 2, 40, 0x0001, "Special Instructions", NULL },
79 { 2, 42, 0x0001, "Action Advised", NULL },
80 { 2, 45, 0x0001, "Reference Service", NULL },
81 { 2, 47, 0x0001, "Reference Date", NULL },
82 { 2, 50, 0x0001, "Reference Number", NULL },
83 { 2, 55, 0x0001, "Date Created", NULL },
84 { 2, 60, 0x0001, "Time Created", NULL },
85 { 2, 62, 0x0001, "Digital Creation Date", NULL },
86 { 2, 63, 0x0001, "Digital Creation Time", NULL },
87 { 2, 65, 0x0001, "Originating Program", NULL },
88 { 2, 70, 0x0001, "Program Version", NULL },
89 { 2, 75, 0x0001, "Object Cycle", NULL },
90 { 2, 80, 0x0001, "By-line", NULL },
91 { 2, 85, 0x0001, "By-line Title", NULL },
92 { 2, 90, 0x0001, "City", NULL },
93 { 2, 92, 0x0001, "Sub-location", NULL },
94 { 2, 95, 0x0001, "Province/State", NULL },
95 { 2, 100, 0x0001, "Country/Primary Location Code", NULL },
96 { 2, 101, 0x0001, "Country/Primary Location Name", NULL },
97 { 2, 103, 0x0001, "Original Transmission Reference", NULL },
98 { 2, 105, 0x0001, "Headline", NULL },
99 { 2, 110, 0x0001, "Credit", NULL },
100 { 2, 115, 0x0001, "Source", NULL },
101 { 2, 116, 0x0001, "Copyright Notice", NULL },
102 { 2, 118, 0x0001, "Contact", NULL },
103 { 2, 120, 0x0001, "Caption/Abstract", handle_2_120 },
104 { 2, 122, 0x0001, "Writer/Editor", NULL },
105 { 2, 125, 0, "Rasterized Caption", handle_2_125 },
106 { 2, 130, 0x0001, "Image Type", NULL },
107 { 2, 131, 0x0001, "Image Orientation", NULL },
108 { 2, 135, 0x0001, "Language Identifier", NULL },
109 { 2, 150, 0x0001, "Audio Type", NULL },
110 { 2, 151, 0x0001, "Audio Sampling Rate", NULL },
111 { 2, 152, 0x0001, "Audio Sampling Resolution", NULL },
112 { 2, 153, 0x0001, "Audio Duration", NULL },
113 { 2, 154, 0x0001, "Audio Outcue", NULL },
114 { 2, 200, 0, "ObjectData Preview File Format", handle_uint16 },
115 { 2, 201, 0, "ObjectData Preview File Format Version", handle_uint16 },
116 { 2, 202, 0, "ObjectData Preview Data", NULL },
117 // TODO: record 3
118 // TODO: record 6
119 { 7, 10, 0, "Size Mode", NULL },
120 { 7, 20, 0, "Max Subfile Size", NULL },
121 { 7, 90, 0, "ObjectData Size Announced", NULL },
122 { 7, 95, 0, "Maximum ObjectData Size", NULL },
123 { 8, 10, 0, "Subfile", NULL },
124 { 9, 10, 0, "Confirmed ObjectData Size", NULL }
127 static de_encoding get_ds_encoding(deark *c, lctx *d, u8 recnum)
129 if(recnum>=2 && recnum<=6) {
130 return d->charset;
132 return DE_ENCODING_UNKNOWN;
135 static void handle_1_90(deark *c, lctx *d, const struct ds_info *dsi,
136 i64 pos, i64 len)
138 const char *csname;
140 d->charset = DE_ENCODING_UNKNOWN;
142 // TODO: Fully interpret this field.
144 if(len>=3 && !dbuf_memcmp(c->infile, pos, "\x1b\x25\x47", 3)) {
145 d->charset = DE_ENCODING_UTF8;
148 if(d->charset==DE_ENCODING_UTF8)
149 csname="utf-8";
150 else
151 csname="unknown";
153 de_dbg(c, "charset: %s", csname);
156 // Caption/abstract
157 static void handle_2_120(deark *c, lctx *d, const struct ds_info *dsi,
158 i64 pos, i64 len)
160 de_ucstring *s = NULL;
161 dbuf *outf = NULL;
162 de_encoding encoding;
163 const char *fntoken;
165 if(c->extract_level<2) {
166 handle_text(c, d, dsi, pos, len);
167 goto done;
170 // Note that this code for extracting captions while decoding IPTC is not
171 // easily reached, because it requires the -a option, and -a usually causes
172 // IPTC to be extracted instead of decoded.
173 // It can be reached when reading a raw IPTC file, or when using
174 // "-a -opt extractiptc=0".
176 fntoken = "caption.txt";
178 encoding = get_ds_encoding(c, d, dsi->recnum);
179 if(encoding==DE_ENCODING_UNKNOWN) {
180 // If the encoding is unknown, copy the raw bytes.
181 dbuf_create_file_from_slice(c->infile, pos, len, fntoken,
182 NULL, DE_CREATEFLAG_IS_AUX);
183 goto done;
186 // If the encoding is known, convert to UTF-8.
187 s = ucstring_create(c);
188 dbuf_read_to_ucstring(c->infile, pos, len, s, 0, encoding);
189 outf = dbuf_create_output_file(c, fntoken, NULL, DE_CREATEFLAG_IS_AUX);
190 ucstring_write_as_utf8(c, s, outf, 1);
192 done:
193 if(outf) dbuf_close(outf);
194 if(s) ucstring_destroy(s);
197 // Rasterized caption
198 static void handle_2_125(deark *c, lctx *d, const struct ds_info *dsi,
199 i64 pos, i64 len)
201 de_bitmap *img = NULL;
202 i64 rowspan;
203 i64 width1, height1;
205 // I can't find any examples of this field, so this may not be correct.
206 // The format seems to be well-documented, though the pixels are in an
207 // unusual order.
209 width1 = 128; // (Will be rotated to be 460x128)
210 height1 = 460;
211 rowspan = width1/8;
212 if(len < rowspan * height1) goto done;
213 img = de_bitmap_create(c, width1, height1, 1);
214 de_convert_image_bilevel(c->infile, pos, rowspan, img, DE_CVTF_WHITEISZERO);
215 de_bitmap_transpose(img);
216 de_bitmap_write_to_file(img, "caption", DE_CREATEFLAG_IS_AUX|DE_CREATEFLAG_FLIP_IMAGE);
217 done:
218 de_bitmap_destroy(img);
221 // Caller supplies dsi. This function will set its fields.
222 static int lookup_ds_info(u8 recnum, u8 dsnum, struct ds_info *dsi)
224 size_t i;
226 de_zeromem(dsi, sizeof(struct ds_info));
228 for(i=0; i<DE_ARRAYCOUNT(ds_info_arr); i++) {
229 if(ds_info_arr[i].recnum==recnum && ds_info_arr[i].dsnum==dsnum) {
230 *dsi = ds_info_arr[i]; // struct copy
231 return 1;
235 // Not found
236 dsi->recnum = recnum;
237 dsi->dsnum = dsnum;
238 dsi->dsname = "?";
239 return 0;
242 static int read_dflen(deark *c, dbuf *f, i64 pos,
243 i64 *dflen, i64 *bytes_consumed)
245 i64 x;
247 x = dbuf_getu16be(f, pos);
248 if(x<32768) { // "Standard DataSet" format
249 *dflen = x;
250 *bytes_consumed = 2;
252 else { // "Extended DataSet" format
253 i64 length_of_length;
254 i64 i;
256 length_of_length = x - 32768;
257 *dflen = 0;
258 *bytes_consumed = 2 + length_of_length;
260 for(i=0; i<length_of_length; i++) {
261 *dflen = ((*dflen)<<8) | dbuf_getbyte(f, pos+2+i);
263 // IPTC seems to support fields up to (2^262136)-1 bytes.
264 // We arbitrarily limit it (2^48)-1.
265 if((*dflen)>=0x1000000000000LL) {
266 de_err(c, "Bad or unsupported IPTC data field length");
267 return 0;
272 return 1;
275 static void handle_text(deark *c, lctx *d, const struct ds_info *dsi,
276 i64 pos, i64 len)
278 de_encoding encoding;
279 de_ucstring *s = NULL;
281 s = ucstring_create(c);
282 encoding = get_ds_encoding(c, d, dsi->recnum);
283 if(encoding==DE_ENCODING_UNKNOWN)
284 encoding = DE_ENCODING_ASCII;
285 dbuf_read_to_ucstring(c->infile, pos, len, s, 0, encoding);
286 de_dbg(c, "%s: \"%s\"", dsi->dsname, ucstring_getpsz_d(s));
287 ucstring_destroy(s);
290 static void handle_uint16(deark *c, lctx *d, const struct ds_info *dsi,
291 i64 pos, i64 len)
293 i64 x;
294 if(len!=2) return;
295 x = de_getu16be(pos);
296 de_dbg(c, "%s: %d", dsi->dsname, (int)x);
299 static int do_dataset(deark *c, lctx *d, i64 ds_idx, i64 pos1,
300 i64 *bytes_consumed)
302 u8 b;
303 u8 recnum, dsnum;
304 int retval = 0;
305 i64 pos = pos1;
306 i64 dflen;
307 i64 dflen_bytes_consumed;
308 struct ds_info dsi;
309 int ds_known;
311 *bytes_consumed = 0;
313 b = de_getbyte(pos);
314 if(b!=0x1c) {
315 if(b==0x00 && ds_idx>0) {
316 // Extraneous padding at the end of data?
317 if(pos == c->infile->len-1) {
318 de_dbg(c, "[ignoring extra byte at end of IPTC-IIM data]");
320 else {
321 de_warn(c, "Expected %"I64_FMT" bytes of IPTC-IIM data, only found %"I64_FMT,
322 c->infile->len, pos);
325 else {
326 de_err(c, "Bad IPTC tag marker (0x%02x) at %d", (int)b, (int)pos);
328 goto done;
330 pos++;
332 recnum = de_getbyte(pos++);
333 dsnum = de_getbyte(pos++);
335 ds_known = lookup_ds_info(recnum, dsnum, &dsi);
337 if(!read_dflen(c, c->infile, pos, &dflen, &dflen_bytes_consumed)) goto done;
338 pos += dflen_bytes_consumed;
340 de_dbg(c, "IPTC dataset %d:%02d (%s) dpos=%" I64_FMT " dlen=%" I64_FMT "",
341 (int)recnum, (int)dsnum, dsi.dsname, pos, dflen);
343 // Decode the value
344 de_dbg_indent(c, 1);
346 if(dsi.hfn) {
347 dsi.hfn(c, d, &dsi, pos, dflen);
349 else if(dsi.flags&0x1) {
350 handle_text(c, d, &dsi, pos, dflen);
352 else if(dsi.recnum==2 && !ds_known) {
353 // Unknown record-2 datasets often contain readable text.
354 handle_text(c, d, &dsi, pos, dflen);
356 pos += dflen;
358 de_dbg_indent(c, -1);
361 *bytes_consumed = pos - pos1;
362 retval = 1;
363 done:
364 return retval;
367 static void de_run_iptc(deark *c, de_module_params *mparams)
369 lctx *d = NULL;
370 i64 pos;
371 i64 bytes_consumed;
372 i64 ds_count;
374 d = de_malloc(c, sizeof(lctx));
375 d->charset = DE_ENCODING_UNKNOWN;
377 pos = 0;
378 ds_count = 0;
379 while(1) {
380 if(pos>=c->infile->len) break;
381 if(!do_dataset(c, d, ds_count, pos, &bytes_consumed)) break;
382 if(bytes_consumed<=0) break;
383 ds_count++;
384 pos += bytes_consumed;
387 de_free(c, d);
390 static int de_identify_iptc(deark *c)
392 u8 b;
394 // First byte of each dataset is 0x1c.
395 if(de_getbyte(0)!=0x1c) return 0;
397 // Check the record number. Record numbers 1-9 are known.
398 b = de_getbyte(1);
399 if(b<1 || b>15) return 0;
401 // This is not meant to imply that .iptc is an official file extension for
402 // IPTC data. It's just that it's sometimes used by Deark when extracting
403 // IPTC data to a file.
404 if(!de_input_file_has_ext(c, "iptc")) return 0;
406 return 60;
409 void de_module_iptc(deark *c, struct deark_module_info *mi)
411 mi->id = "iptc";
412 mi->desc = "IPTC-IIM metadata";
413 mi->run_fn = de_run_iptc;
414 mi->identify_fn = de_identify_iptc;