lzhuf: Refactored to avoid direct array access
[deark.git] / modules / iptc.c
blobcbf5b22156fa70acda49df862927f64a442f5186
1 // This file is part of Deark.
2 // Copyright (C) 2016 Jason Summers
3 // See the file COPYING for terms of use.
5 // IPTC-IIM metadata
7 #include <deark-config.h>
8 #include <deark-private.h>
9 DE_DECLARE_MODULE(de_module_iptc);
11 typedef struct localctx_struct {
12 // The coded character set defined in 1:90.
13 // This applied to records 2-6, and sometimes 8.
14 de_encoding charset;
15 } lctx;
17 struct ds_info;
19 typedef void (*ds_handler_fn)(deark *c, lctx *d, const struct ds_info *dsi,
20 i64 pos, i64 len);
22 struct ds_info {
23 u8 recnum;
24 u8 dsnum;
26 // 0x1 = A field consisting entirely of text ("graphic characters",
27 // "alphabetic characters", "numeric characters, "spaces", etc.)
28 u32 flags;
30 const char *dsname;
31 ds_handler_fn hfn;
34 static void handle_text(deark *c, lctx *d, const struct ds_info *dsi,
35 i64 pos, i64 len);
36 static void handle_uint16(deark *c, lctx *d, const struct ds_info *dsi,
37 i64 pos, i64 len);
38 static void handle_1_90(deark *c, lctx *d, const struct ds_info *dsi,
39 i64 pos, i64 len);
40 static void handle_2_120(deark *c, lctx *d, const struct ds_info *dsi,
41 i64 pos, i64 len);
42 static void handle_2_125(deark *c, lctx *d, const struct ds_info *dsi,
43 i64 pos, i64 len);
45 static const struct ds_info ds_info_arr[] = {
46 { 1, 0, 0, "Model Version", handle_uint16 },
47 { 1, 5, 0x0001, "Destination", NULL },
48 { 1, 20, 0, "File Format", handle_uint16 },
49 { 1, 22, 0, "File Format Version", handle_uint16 },
50 { 1, 30, 0x0001, "Service Identifier", NULL },
51 { 1, 40, 0x0001, "Envelope Number", NULL },
52 { 1, 50, 0x0001, "Product I.D.", NULL },
53 { 1, 60, 0x0001, "Envelope Priority", NULL },
54 { 1, 70, 0x0001, "Date Sent", NULL },
55 { 1, 80, 0x0001, "Time Sent", NULL },
56 { 1, 90, 0, "Coded Character Set", handle_1_90 },
57 { 1, 100, 0x0001, "UNO", NULL },
58 { 1, 120, 0, "ARM Identifier", handle_uint16 },
59 { 1, 122, 0, "ARM Version", handle_uint16 },
60 { 2, 0, 0, "Record Version", handle_uint16 },
61 { 2, 3, 0x0001, "Object Type Reference", NULL },
62 { 2, 4, 0x0001, "Object Attribute Reference", NULL },
63 { 2, 5, 0x0001, "Object Name", NULL },
64 { 2, 7, 0x0001, "Edit Status", NULL },
65 { 2, 8, 0x0001, "Editorial Update", NULL },
66 { 2, 10, 0x0001, "Urgency", NULL },
67 { 2, 12, 0x0001, "Subject Reference", NULL },
68 { 2, 15, 0x0001, "Category", NULL },
69 { 2, 20, 0x0001, "Supplemental Category", NULL },
70 { 2, 22, 0x0001, "Fixture Identifier", NULL },
71 { 2, 25, 0x0001, "Keywords", NULL },
72 { 2, 26, 0x0001, "Content Location Code", NULL },
73 { 2, 27, 0x0001, "Content Location Name", NULL },
74 { 2, 30, 0x0001, "Release Date", NULL },
75 { 2, 35, 0x0001, "Release Time", NULL },
76 { 2, 37, 0x0001, "Expiration Date", NULL },
77 { 2, 38, 0x0001, "Expiration Time", NULL },
78 { 2, 40, 0x0001, "Special Instructions", NULL },
79 { 2, 42, 0x0001, "Action Advised", NULL },
80 { 2, 45, 0x0001, "Reference Service", NULL },
81 { 2, 47, 0x0001, "Reference Date", NULL },
82 { 2, 50, 0x0001, "Reference Number", NULL },
83 { 2, 55, 0x0001, "Date Created", NULL },
84 { 2, 60, 0x0001, "Time Created", NULL },
85 { 2, 62, 0x0001, "Digital Creation Date", NULL },
86 { 2, 63, 0x0001, "Digital Creation Time", NULL },
87 { 2, 65, 0x0001, "Originating Program", NULL },
88 { 2, 70, 0x0001, "Program Version", NULL },
89 { 2, 75, 0x0001, "Object Cycle", NULL },
90 { 2, 80, 0x0001, "By-line", NULL },
91 { 2, 85, 0x0001, "By-line Title", NULL },
92 { 2, 90, 0x0001, "City", NULL },
93 { 2, 92, 0x0001, "Sub-location", NULL },
94 { 2, 95, 0x0001, "Province/State", NULL },
95 { 2, 100, 0x0001, "Country/Primary Location Code", NULL },
96 { 2, 101, 0x0001, "Country/Primary Location Name", NULL },
97 { 2, 103, 0x0001, "Original Transmission Reference", NULL },
98 { 2, 105, 0x0001, "Headline", NULL },
99 { 2, 110, 0x0001, "Credit", NULL },
100 { 2, 115, 0x0001, "Source", NULL },
101 { 2, 116, 0x0001, "Copyright Notice", NULL },
102 { 2, 118, 0x0001, "Contact", NULL },
103 { 2, 120, 0x0001, "Caption/Abstract", handle_2_120 },
104 { 2, 122, 0x0001, "Writer/Editor", NULL },
105 { 2, 125, 0, "Rasterized Caption", handle_2_125 },
106 { 2, 130, 0x0001, "Image Type", NULL },
107 { 2, 131, 0x0001, "Image Orientation", NULL },
108 { 2, 135, 0x0001, "Language Identifier", NULL },
109 { 2, 150, 0x0001, "Audio Type", NULL },
110 { 2, 151, 0x0001, "Audio Sampling Rate", NULL },
111 { 2, 152, 0x0001, "Audio Sampling Resolution", NULL },
112 { 2, 153, 0x0001, "Audio Duration", NULL },
113 { 2, 154, 0x0001, "Audio Outcue", NULL },
114 { 2, 200, 0, "ObjectData Preview File Format", handle_uint16 },
115 { 2, 201, 0, "ObjectData Preview File Format Version", handle_uint16 },
116 { 2, 202, 0, "ObjectData Preview Data", NULL },
117 // TODO: record 3
118 // TODO: record 6
119 { 7, 10, 0, "Size Mode", NULL },
120 { 7, 20, 0, "Max Subfile Size", NULL },
121 { 7, 90, 0, "ObjectData Size Announced", NULL },
122 { 7, 95, 0, "Maximum ObjectData Size", NULL },
123 { 8, 10, 0, "Subfile", NULL },
124 { 9, 10, 0, "Confirmed ObjectData Size", NULL }
127 static de_encoding get_ds_encoding(deark *c, lctx *d, u8 recnum)
129 if(recnum>=2 && recnum<=6) {
130 return d->charset;
132 return DE_ENCODING_UNKNOWN;
135 static void handle_1_90(deark *c, lctx *d, const struct ds_info *dsi,
136 i64 pos, i64 len)
138 const char *csname;
140 d->charset = DE_ENCODING_UNKNOWN;
142 // TODO: Fully interpret this field.
144 if(len>=3 && !dbuf_memcmp(c->infile, pos, "\x1b\x25\x47", 3)) {
145 d->charset = DE_ENCODING_UTF8;
148 if(d->charset==DE_ENCODING_UTF8)
149 csname="utf-8";
150 else
151 csname="unknown";
153 de_dbg(c, "charset: %s", csname);
156 // Caption/abstract
157 static void handle_2_120(deark *c, lctx *d, const struct ds_info *dsi,
158 i64 pos, i64 len)
160 de_ucstring *s = NULL;
161 dbuf *outf = NULL;
162 de_encoding encoding;
163 const char *fntoken;
165 if(c->extract_level<2) {
166 handle_text(c, d, dsi, pos, len);
167 goto done;
170 // Note that this code for extracting captions while decoding IPTC is not
171 // easily reached, because it requires the -a option, and -a usually causes
172 // IPTC to be extracted instead of decoded.
173 // It can be reached when reading a raw IPTC file, or when using
174 // "-a -opt extractiptc=0".
176 fntoken = "caption.txt";
178 encoding = get_ds_encoding(c, d, dsi->recnum);
179 if(encoding==DE_ENCODING_UNKNOWN) {
180 // If the encoding is unknown, copy the raw bytes.
181 dbuf_create_file_from_slice(c->infile, pos, len, fntoken,
182 NULL, DE_CREATEFLAG_IS_AUX);
183 goto done;
186 // If the encoding is known, convert to UTF-8.
187 s = ucstring_create(c);
188 dbuf_read_to_ucstring(c->infile, pos, len, s, 0, encoding);
189 outf = dbuf_create_output_file(c, fntoken, NULL, DE_CREATEFLAG_IS_AUX);
190 ucstring_write_as_utf8(c, s, outf, 1);
192 done:
193 if(outf) dbuf_close(outf);
194 if(s) ucstring_destroy(s);
197 // Rasterized caption
198 static void handle_2_125(deark *c, lctx *d, const struct ds_info *dsi,
199 i64 pos, i64 len)
201 dbuf *unc_pixels = NULL;
202 de_bitmap *img = NULL;
203 i64 i, j;
204 u8 b;
205 i64 rowspan;
206 i64 width, height;
208 // I can't find any examples of this field, so this may not be correct.
209 // The format seems to be well-documented, though the pixels are in an
210 // unusual order.
212 unc_pixels = dbuf_open_input_subfile(c->infile, pos, len);
213 width = 460;
214 height = 128;
215 img = de_bitmap_create(c, width, height, 1);
216 rowspan = height/8;
218 for(j=0; j<width; j++) {
219 for(i=0; i<height; i++) {
220 b = de_get_bits_symbol(unc_pixels, 1, rowspan*j, i);
221 de_bitmap_setpixel_gray(img, j, (height-1-i), b?0:255);
225 de_bitmap_write_to_file(img, "caption", DE_CREATEFLAG_IS_AUX);
226 de_bitmap_destroy(img);
227 dbuf_close(unc_pixels);
230 // Caller supplies dsi. This function will set its fields.
231 static int lookup_ds_info(u8 recnum, u8 dsnum, struct ds_info *dsi)
233 size_t i;
235 de_zeromem(dsi, sizeof(struct ds_info));
237 for(i=0; i<DE_ARRAYCOUNT(ds_info_arr); i++) {
238 if(ds_info_arr[i].recnum==recnum && ds_info_arr[i].dsnum==dsnum) {
239 *dsi = ds_info_arr[i]; // struct copy
240 return 1;
244 // Not found
245 dsi->recnum = recnum;
246 dsi->dsnum = dsnum;
247 dsi->dsname = "?";
248 return 0;
251 static int read_dflen(deark *c, dbuf *f, i64 pos,
252 i64 *dflen, i64 *bytes_consumed)
254 i64 x;
256 x = dbuf_getu16be(f, pos);
257 if(x<32768) { // "Standard DataSet" format
258 *dflen = x;
259 *bytes_consumed = 2;
261 else { // "Extended DataSet" format
262 i64 length_of_length;
263 i64 i;
265 length_of_length = x - 32768;
266 *dflen = 0;
267 *bytes_consumed = 2 + length_of_length;
269 for(i=0; i<length_of_length; i++) {
270 *dflen = ((*dflen)<<8) | dbuf_getbyte(f, pos+2+i);
272 // IPTC seems to support fields up to (2^262136)-1 bytes.
273 // We arbitrarily limit it (2^48)-1.
274 if((*dflen)>=0x1000000000000LL) {
275 de_err(c, "Bad or unsupported IPTC data field length");
276 return 0;
281 return 1;
284 static void handle_text(deark *c, lctx *d, const struct ds_info *dsi,
285 i64 pos, i64 len)
287 de_encoding encoding;
288 de_ucstring *s = NULL;
290 s = ucstring_create(c);
291 encoding = get_ds_encoding(c, d, dsi->recnum);
292 if(encoding==DE_ENCODING_UNKNOWN)
293 encoding = DE_ENCODING_ASCII;
294 dbuf_read_to_ucstring(c->infile, pos, len, s, 0, encoding);
295 de_dbg(c, "%s: \"%s\"", dsi->dsname, ucstring_getpsz_d(s));
296 ucstring_destroy(s);
299 static void handle_uint16(deark *c, lctx *d, const struct ds_info *dsi,
300 i64 pos, i64 len)
302 i64 x;
303 if(len!=2) return;
304 x = de_getu16be(pos);
305 de_dbg(c, "%s: %d", dsi->dsname, (int)x);
308 static int do_dataset(deark *c, lctx *d, i64 ds_idx, i64 pos1,
309 i64 *bytes_consumed)
311 u8 b;
312 u8 recnum, dsnum;
313 int retval = 0;
314 i64 pos = pos1;
315 i64 dflen;
316 i64 dflen_bytes_consumed;
317 struct ds_info dsi;
318 int ds_known;
320 *bytes_consumed = 0;
322 b = de_getbyte(pos);
323 if(b!=0x1c) {
324 if(b==0x00 && ds_idx>0) {
325 // Extraneous padding at the end of data?
326 if(pos == c->infile->len-1) {
327 de_dbg(c, "[ignoring extra byte at end of IPTC-IIM data]");
329 else {
330 de_warn(c, "Expected %"I64_FMT" bytes of IPTC-IIM data, only found %"I64_FMT,
331 c->infile->len, pos);
334 else {
335 de_err(c, "Bad IPTC tag marker (0x%02x) at %d", (int)b, (int)pos);
337 goto done;
339 pos++;
341 recnum = de_getbyte(pos++);
342 dsnum = de_getbyte(pos++);
344 ds_known = lookup_ds_info(recnum, dsnum, &dsi);
346 if(!read_dflen(c, c->infile, pos, &dflen, &dflen_bytes_consumed)) goto done;
347 pos += dflen_bytes_consumed;
349 de_dbg(c, "IPTC dataset %d:%02d (%s) dpos=%" I64_FMT " dlen=%" I64_FMT "",
350 (int)recnum, (int)dsnum, dsi.dsname, pos, dflen);
352 // Decode the value
353 de_dbg_indent(c, 1);
355 if(dsi.hfn) {
356 dsi.hfn(c, d, &dsi, pos, dflen);
358 else if(dsi.flags&0x1) {
359 handle_text(c, d, &dsi, pos, dflen);
361 else if(dsi.recnum==2 && !ds_known) {
362 // Unknown record-2 datasets often contain readable text.
363 handle_text(c, d, &dsi, pos, dflen);
365 pos += dflen;
367 de_dbg_indent(c, -1);
370 *bytes_consumed = pos - pos1;
371 retval = 1;
372 done:
373 return retval;
376 static void de_run_iptc(deark *c, de_module_params *mparams)
378 lctx *d = NULL;
379 i64 pos;
380 i64 bytes_consumed;
381 i64 ds_count;
383 d = de_malloc(c, sizeof(lctx));
384 d->charset = DE_ENCODING_UNKNOWN;
386 pos = 0;
387 ds_count = 0;
388 while(1) {
389 if(pos>=c->infile->len) break;
390 if(!do_dataset(c, d, ds_count, pos, &bytes_consumed)) break;
391 if(bytes_consumed<=0) break;
392 ds_count++;
393 pos += bytes_consumed;
396 de_free(c, d);
399 static int de_identify_iptc(deark *c)
401 u8 b;
403 // First byte of each dataset is 0x1c.
404 if(de_getbyte(0)!=0x1c) return 0;
406 // Check the record number. Record numbers 1-9 are known.
407 b = de_getbyte(1);
408 if(b<1 || b>15) return 0;
410 // This is not meant to imply that .iptc is an official file extension for
411 // IPTC data. It's just that it's sometimes used by Deark when extracting
412 // IPTC data to a file.
413 if(!de_input_file_has_ext(c, "iptc")) return 0;
415 return 60;
418 void de_module_iptc(deark *c, struct deark_module_info *mi)
420 mi->id = "iptc";
421 mi->desc = "IPTC-IIM metadata";
422 mi->run_fn = de_run_iptc;
423 mi->identify_fn = de_identify_iptc;