fnt: Improved error handling, etc.
[deark.git] / modules / wri.c
blob710e156508cb61c24dcd0efddb66bd74f818a3ba
1 // This file is part of Deark.
2 // Copyright (C) 2018 Jason Summers
3 // See the file COPYING for terms of use.
5 // Microsoft Windows Write (.wri) format
7 #include <deark-config.h>
8 #include <deark-private.h>
9 DE_DECLARE_MODULE(de_module_wri);
// Picture "storage type" codes. This is the 16-bit value found at the very
// start of a picture paragraph; it selects the header variant that follows.
#define WRI_STG_METAFILE   0x88
#define WRI_STG_BITMAP     0xe3
#define WRI_STG_OLE        0xe4
15 struct picctx_struct {
16 unsigned int mm; // WRI_STG_*
17 i64 cbHeader;
18 i64 cbSize;
19 unsigned int ole_objectType;
20 i64 ole_dwDataSize;
23 struct text_styles_struct {
24 u8 tab_style;
27 struct para_info {
28 i64 thisparapos, thisparalen;
29 i64 bfprop_offset; // file-level offset
30 u8 papflags;
31 u8 justification;
33 int in_para;
34 int xpos; // Current length of this line in the source code
35 int has_content; // Have we emitted a non-space char in this paragraph?
36 int space_count;
38 int in_span;
39 struct text_styles_struct text_styles_wanted; // Styles for the next char to be emitted
40 struct text_styles_struct text_styles_current; // Effective current styles
43 typedef struct localctx_struct {
44 int extract_text;
45 int extract_ole;
46 int input_encoding;
47 i64 fcMac;
48 i64 pnChar;
49 i64 pnChar_offs;
50 i64 pnPara;
51 i64 pnPara_offs;
52 i64 pnPara_npages;
53 i64 pnFntb, pnSep, pnSetb, pnPgtb, pnFfntb;
54 i64 pnMac;
55 dbuf *html_outf;
56 de_ucstring *tmpstr;
57 struct de_encconv_state es;
58 } lctx;
60 static void do_emit_raw_sz(deark *c, lctx *d, struct para_info *pinfo, const char *sz);
61 static void do_emit_ucstring(deark *c, lctx *d, struct para_info *pinfo,
62 de_ucstring *s);
63 static void end_para(deark *c, lctx *d, struct para_info *pinfo);
65 static void default_text_styles(struct text_styles_struct *ts)
67 de_zeromem(ts, sizeof(struct text_styles_struct));
70 static int text_styles_differ(const struct text_styles_struct *ts1,
71 const struct text_styles_struct *ts2)
73 if(ts1->tab_style != ts2->tab_style) return 1;
74 return 0;
77 static int do_header(deark *c, lctx *d, i64 pos)
79 de_dbg(c, "header at %d", (int)pos);
80 de_dbg_indent(c, 1);
82 d->fcMac = de_getu32le(pos+7*2);
83 de_dbg(c, "fcMac: %d", (int)d->fcMac);
84 d->pnChar = (d->fcMac + 127) / 128;
85 d->pnChar_offs = d->pnChar * 128;
86 de_dbg(c, "pnChar: page %d (offset %d)", (int)d->pnChar, (int)d->pnChar_offs);
88 d->pnPara = de_getu16le(pos+9*2);
89 d->pnPara_offs = d->pnPara * 128;
90 de_dbg(c, "pnPara: page %d (offset %d)", (int)d->pnPara, (int)d->pnPara_offs);
92 d->pnFntb = de_getu16le(pos+10*2);
93 de_dbg(c, "pnFntb: page %d", (int)d->pnFntb);
95 d->pnSep = de_getu16le(pos+11*2);
96 de_dbg(c, "pnSep: page %d", (int)d->pnSep);
98 d->pnSetb = de_getu16le(pos+12*2);
99 de_dbg(c, "pnSetb: page %d", (int)d->pnSetb);
101 d->pnPgtb = de_getu16le(pos+13*2);
102 de_dbg(c, "pnPgtb: page %d", (int)d->pnPgtb);
104 d->pnFfntb = de_getu16le(pos+14*2);
105 de_dbg(c, "pnFfntb: page %d", (int)d->pnFfntb);
107 d->pnMac = de_getu16le(pos+48*2);
108 de_dbg(c, "pnMac: %d pages", (int)d->pnMac);
110 d->pnPara_npages = d->pnFntb - d->pnPara;
112 de_dbg_indent(c, -1);
113 return 1;
// Translate the objectType field of an OLE picture header to a printable
// name. Unrecognized values yield "?".
static const char *get_objecttype1_name(unsigned int t)
{
	static const char *const names[] = { "static", "embedded", "link" };

	if(t>=1 && t<=3) {
		return names[t-1];
	}
	return "?";
}
128 // Read the usually-40-byte picture header.
129 // The header has 3 variants (metafile, bitmap, OLE), which have some common
130 // fields, and some different fields. So, this function is kind of ugly.
131 static void do_picture_header(deark *c, lctx *d, struct para_info *pinfo,
132 struct picctx_struct *picctx)
134 i64 pos1 = pinfo->thisparapos;
135 i64 pos = pos1;
136 i64 n1, n2;
138 // (The initial "mm" / "storage type" field has already been read.)
140 if(picctx->mm==WRI_STG_METAFILE) {
141 pos = pos1+2;
142 n1 = de_getu16le_p(&pos);
143 n2 = de_getu16le_p(&pos);
144 de_dbg(c, "xExt,yExt: %d"DE_CHAR_TIMES"%d twips", (int)n1, (int)n2);
147 if(picctx->mm==WRI_STG_OLE) {
148 // This field seems important, but we don't use it, because the OLE
149 // FormatID field is sufficient.
150 picctx->ole_objectType = (unsigned int)de_getu16le(pos1+6);
151 de_dbg(c, "objectType: %u (%s)", picctx->ole_objectType,
152 get_objecttype1_name(picctx->ole_objectType));
155 pos = pos1+8;
156 n1 = de_getu16le_p(&pos);
157 de_dbg(c, "dxaOffset: %d twips", (int)n1);
158 n1 = de_getu16le_p(&pos);
159 n2 = de_getu16le_p(&pos);
160 de_dbg(c, "dxaSize,dyaSize: %d"DE_CHAR_TIMES"%d twips", (int)n1, (int)n2);
162 if(picctx->mm==WRI_STG_BITMAP) {
163 de_dbg(c, "[DDB header at %"I64_FMT"]", pos1+16);
166 if(picctx->mm==WRI_STG_OLE) {
167 picctx->ole_dwDataSize = de_getu32le(pos1+16);
168 de_dbg(c, "dwDataSize: %d", (int)picctx->ole_dwDataSize);
169 n1 = de_getu32le(pos1+24);
170 de_dbg(c, "dwObjNum: 0x%08x", (unsigned int)n1);
173 picctx->cbHeader = de_getu16le(pos1+30);
174 de_dbg(c, "header size: %d", (int)picctx->cbHeader);
176 if(picctx->mm==WRI_STG_METAFILE || picctx->mm==WRI_STG_BITMAP) {
177 picctx->cbSize = de_getu32le(pos1+32);
178 de_dbg(c, "data size: %d", (int)picctx->cbSize);
181 pos = pos1+36;
182 n1 = de_getu16le_p(&pos);
183 n2 = de_getu16le_p(&pos);
184 de_dbg(c, "scaling factor x,y: %d,%d", (int)n1, (int)n2);
187 static void do_picture_metafile(deark *c, lctx *d, struct para_info *pinfo,
188 struct picctx_struct *picctx)
190 i64 pos = pinfo->thisparapos;
192 if(picctx->cbHeader+picctx->cbSize > pinfo->thisparalen) goto done;
193 de_dbg(c, "metafile data at %"I64_FMT, pos+picctx->cbHeader);
194 dbuf_create_file_from_slice(c->infile, pos+picctx->cbHeader, picctx->cbSize,
195 "wmf", NULL, 0);
196 done:
200 static void do_picture_bitmap(deark *c, lctx *d, struct para_info *pinfo,
201 struct picctx_struct *picctx)
203 i64 hdrpos, bitspos;
204 i64 bitssize;
205 dbuf *tmpf = NULL;
207 if(picctx->cbHeader + picctx->cbSize > pinfo->thisparalen) goto done;
208 hdrpos = pinfo->thisparapos + 16;
209 bitspos = pinfo->thisparapos + picctx->cbHeader;
210 bitssize = picctx->cbSize;
211 de_dbg(c, "processing DDB, header at %"I64_FMT", pixels at %"I64_FMT,
212 hdrpos, bitspos);
214 // Most commonly, the DDB bits immediately follow the header. But in this
215 // format, they are separated, with some non-DDB data in between.
216 // We'll construct a temporary DDB, in which they are contiguous, to pass
217 // to the ddb module.
218 tmpf = dbuf_create_membuf(c, 14+bitssize, 0);
219 dbuf_copy(c->infile, hdrpos, 14, tmpf);
220 dbuf_copy(c->infile, bitspos, bitssize, tmpf);
222 de_dbg_indent(c, 1);
223 de_run_module_by_id_on_slice2(c, "ddb", "N", tmpf, 0, tmpf->len);
224 de_dbg_indent(c, -1);
226 done:
227 dbuf_close(tmpf);
230 static const char *get_picture_storage_type_name(unsigned int t)
232 const char *name;
233 switch(t) {
234 case WRI_STG_METAFILE: name="metafile"; break;
235 case WRI_STG_BITMAP: name="bitmap"; break;
236 case WRI_STG_OLE: name="OLE object"; break;
237 default: name="?"; break;
239 return name;
242 static void do_picture_ole(deark *c, lctx *d, struct para_info *pinfo,
243 struct picctx_struct *picctx)
245 i64 pos = pinfo->thisparapos;
246 i64 ole_len;
247 de_module_params *mparams = NULL;
249 pos += picctx->cbHeader;
251 mparams = de_malloc(c, sizeof(de_module_params));
252 mparams->in_params.input_encoding = d->input_encoding;
254 ole_len = de_min_int(picctx->ole_dwDataSize, pinfo->thisparapos+pinfo->thisparalen-pos);
255 de_dbg(c, "OLE1 data at %"I64_FMT", len=%"I64_FMT, pos, ole_len);
256 de_dbg_indent(c, 1);
257 de_run_module_by_id_on_slice(c, "ole1", mparams, c->infile, pos, ole_len);
258 de_dbg_indent(c, -1);
260 de_free(c, mparams);
263 static int get_next_output_file_id(deark *c)
265 return c->file_count;
268 static void do_picture(deark *c, lctx *d, struct para_info *pinfo)
270 int orig_file_count, curr_file_count;
271 struct picctx_struct *picctx = NULL;
272 i64 pos = pinfo->thisparapos;
274 picctx = de_malloc(c, sizeof(struct picctx_struct));
275 if(pinfo->thisparalen<2) goto done;
276 picctx->mm = (unsigned int)de_getu16le(pos);
277 de_dbg(c, "picture storage type: 0x%04x (%s)", picctx->mm,
278 get_picture_storage_type_name(picctx->mm));
280 orig_file_count = get_next_output_file_id(c);
282 do_picture_header(c, d, pinfo, picctx);
284 switch(picctx->mm) {
285 case WRI_STG_METAFILE:
286 do_picture_metafile(c, d, pinfo, picctx);
287 break;
288 case WRI_STG_BITMAP:
289 do_picture_bitmap(c, d, pinfo, picctx);
290 break;
291 case WRI_STG_OLE:
292 do_picture_ole(c, d, pinfo, picctx);
293 break;
294 default:
295 de_err(c, "Picture storage type 0x%04x not supported", picctx->mm);
298 if(d->html_outf) {
299 // We want to include the image file ID numbers in the HTML document,
300 // so that the user can figure out which image goes where.
301 // To deduce the ID number, we watch the global file ID counter.
302 // It's totally a hack, but unfortunately our high level functions that
303 // create an output file (e.g. de_convert_and_write_image_bilevel) do
304 // not have a way return the ID number of the file they created. It
305 // would be a lot of trouble to create such a mechanism.
307 do_emit_raw_sz(c, d, pinfo, "<p class=r>");
308 pinfo->in_para = 1;
309 ucstring_empty(d->tmpstr);
310 ucstring_append_sz(d->tmpstr, "object", DE_ENCODING_LATIN1);
312 curr_file_count = get_next_output_file_id(c);
313 if(curr_file_count == orig_file_count+1) {
314 ucstring_printf(d->tmpstr, DE_ENCODING_LATIN1, " %d", orig_file_count);
316 else if(curr_file_count == orig_file_count) {
317 ucstring_append_sz(d->tmpstr, " (not extracted)", DE_ENCODING_LATIN1);
319 else {
320 ucstring_printf(d->tmpstr, DE_ENCODING_UTF8, "s %d" "\xe2\x80\x93" "%d",
321 orig_file_count, curr_file_count-1);
323 do_emit_ucstring(c, d, pinfo, d->tmpstr);
324 end_para(c, d, pinfo);
327 done:
328 de_free(c, picctx);
331 static void ensure_in_para(deark *c, lctx *d, struct para_info *pinfo)
333 if(pinfo->in_para) return;
334 do_emit_raw_sz(c, d, pinfo, "<p");
335 switch(pinfo->justification) {
336 case 1: do_emit_raw_sz(c, d, pinfo, " class=tc"); break;
337 case 2: do_emit_raw_sz(c, d, pinfo, " class=tr"); break;
338 case 3: do_emit_raw_sz(c, d, pinfo, " class=tj"); break;
340 do_emit_raw_sz(c, d, pinfo, ">");
341 pinfo->in_para = 1;
344 // Emit a data codepoint, inside a paragraph.
345 static void do_emit_codepoint(deark *c, lctx *d, struct para_info *pinfo, i32 outcp)
347 int styles_changed;
349 if(!pinfo->in_para) {
350 ensure_in_para(c, d, pinfo);
353 styles_changed = text_styles_differ(&pinfo->text_styles_current, &pinfo->text_styles_wanted);
355 if(pinfo->in_span && styles_changed) {
356 do_emit_raw_sz(c, d, pinfo, "</span>");
357 pinfo->in_span = 0;
359 if(styles_changed) {
360 if(pinfo->text_styles_wanted.tab_style) {
361 do_emit_raw_sz(c, d, pinfo, "<span class=c>");
362 pinfo->in_span = 1;
364 pinfo->text_styles_current = pinfo->text_styles_wanted; // struct copy
367 de_write_codepoint_to_html(c, d->html_outf, outcp);
369 // FIXME: We'd like to know how many characters (not bytes) were written,
370 // but we don't currently have a good way to do that in the case where the
371 // codepoint was written as an HTML entity.
372 pinfo->xpos++;
374 if(outcp!=32) {
375 pinfo->has_content = 1;
379 // Same as calling do_emit_codepoint() on each character.
380 static void do_emit_ucstring(deark *c, lctx *d, struct para_info *pinfo,
381 de_ucstring *s)
383 i64 k;
385 if(!s) return;
386 for(k=0; k<s->len; k++) {
387 do_emit_codepoint(c, d, pinfo, s->str[k]);
391 // Emit a raw string. Does not force a paragraph to be open.
392 // Updates pinfo->xpos (assumes 1 byte per char).
393 // For xpos, handles the case where sz ends with a newline, but does not
394 // handle internal newlines.
395 static void do_emit_raw_sz(deark *c, lctx *d, struct para_info *pinfo, const char *sz)
397 size_t sz_len = de_strlen(sz);
398 if(sz_len<1) return;
399 dbuf_write(d->html_outf, (const u8*)sz, (i64)sz_len);
400 if(sz[sz_len-1]=='\n') {
401 pinfo->xpos = 0;
403 else {
404 pinfo->xpos += (int)sz_len;
408 static void end_para(deark *c, lctx *d, struct para_info *pinfo)
410 if(!pinfo->in_para) return;
412 if(pinfo->in_span) {
413 do_emit_raw_sz(c, d, pinfo, "</span>");
414 pinfo->in_span = 0;
417 if(!pinfo->has_content) {
418 // No empty paragraphs allowed. HTML will collapse them, but Write does not.
419 do_emit_codepoint(c, d, pinfo, 0xa0);
421 do_emit_raw_sz(c, d, pinfo, "</p>\n");
422 pinfo->in_para = 0;
423 default_text_styles(&pinfo->text_styles_current);
426 static void do_text_paragraph(deark *c, lctx *d, struct para_info *pinfo)
428 i64 i, k;
430 if(!d->html_outf) return;
432 if((pinfo->papflags & 0x06)!=0) {
433 // TODO: Decode headers and footers somehow.
434 do_emit_raw_sz(c, d, pinfo, "<p class=r>");
435 do_emit_raw_sz(c, d, pinfo, (pinfo->papflags&0x01)?"footer":"header");
436 do_emit_raw_sz(c, d, pinfo, " definition</p>\n");
437 return;
440 pinfo->in_para = 0;
441 pinfo->xpos = 0;
442 pinfo->space_count = 0;
443 pinfo->has_content = 0;
444 pinfo->in_span = 0;
445 default_text_styles(&pinfo->text_styles_wanted);
446 default_text_styles(&pinfo->text_styles_current);
448 for(i=0; i<pinfo->thisparalen; i++) {
449 u8 incp;
451 incp = de_getbyte(pinfo->thisparapos+i);
452 if(incp==0x0d && i<pinfo->thisparalen-1) {
453 if(de_getbyte(pinfo->thisparapos+i+1)==0x0a) {
454 // Found CR-LF combo
455 i++;
456 ensure_in_para(c, d, pinfo);
457 end_para(c, d, pinfo);
458 continue;
462 if(incp!=32 && pinfo->space_count>0) {
463 int nonbreaking_count, breaking_count;
465 if(!pinfo->in_para && pinfo->space_count==1) {
466 // If the paragraph starts with a single space, make it nonbreaking.
467 nonbreaking_count = 1;
468 breaking_count = 0;
470 else {
471 // Else make all spaces but the last one nonbreaking
472 nonbreaking_count = pinfo->space_count-1;
473 breaking_count = 1;
476 ensure_in_para(c, d, pinfo);
478 for(k=0; k<nonbreaking_count; k++) {
479 do_emit_codepoint(c, d, pinfo, 0xa0);
482 if(breaking_count>0) {
483 if(pinfo->xpos>70) {
484 // We don't do proper word wrapping of the HTML source, but
485 // maybe this is better than nothing.
486 do_emit_raw_sz(c, d, pinfo, "\n");
488 else {
489 do_emit_codepoint(c, d, pinfo, 32);
493 pinfo->space_count=0;
496 if(incp>=33) {
497 i32 outcp;
499 // TODO: Decide if we ought to support multi-code-unit encodings
500 // like UTF-8.
501 outcp = de_char_to_unicode_ex((i32)incp, &d->es);
502 do_emit_codepoint(c, d, pinfo, outcp);
504 else {
505 switch(incp) {
506 case 9: // tab
507 pinfo->text_styles_wanted.tab_style = 1;
508 do_emit_codepoint(c, d, pinfo, 0x2192);
509 pinfo->text_styles_wanted.tab_style = 0;
510 break;
511 case 10:
512 case 11:
513 ensure_in_para(c, d, pinfo);
514 do_emit_raw_sz(c, d, pinfo, "<br>\n");
515 pinfo->has_content = 1;
516 break;
517 case 12: // page break
518 end_para(c, d, pinfo);
519 do_emit_raw_sz(c, d, pinfo, "<hr>\n");
520 break;
521 case 31:
522 break;
523 case 32:
524 pinfo->space_count++;
525 break;
526 default:
527 do_emit_codepoint(c, d, pinfo, 0xfffd);
532 end_para(c, d, pinfo);
535 static void do_paragraph(deark *c, lctx *d, struct para_info *pinfo)
537 if(pinfo->papflags&0x10) {
538 de_dbg(c, "picture at %d, len=%d", (int)pinfo->thisparapos,
539 (int)pinfo->thisparalen);
540 de_dbg_indent(c, 1);
541 do_picture(c, d, pinfo);
542 de_dbg_indent(c, -1);
544 else {
545 de_dbg(c, "text paragraph at %d, len=%d", (int)pinfo->thisparapos,
546 (int)pinfo->thisparalen);
547 do_text_paragraph(c, d, pinfo);
551 static void do_para_fprop(deark *c, lctx *d, struct para_info *pinfo,
552 i64 bfprop, u8 is_dup)
554 i64 fprop_dlen = 0;
556 // bfprop is a pointer into the 123 bytes of data starting
557 // at pos+4. The maximum sensible value is at most 122.
558 if(bfprop<=122) {
559 // It appears that the length prefix does not include itself,
560 // contrary to what one source says.
561 fprop_dlen = (i64)de_getbyte(pinfo->bfprop_offset);
562 if(!is_dup) de_dbg(c, "fprop dlen: %d", (int)fprop_dlen);
565 if(fprop_dlen>=2) {
566 pinfo->justification = de_getbyte(pinfo->bfprop_offset + 1 + 1) & 0x03;
567 if(!is_dup && pinfo->justification!=0) {
568 de_dbg(c, "justification: %d", (int)pinfo->justification);
572 if(fprop_dlen>=17) {
573 pinfo->papflags = de_getbyte(pinfo->bfprop_offset + 1 + 16);
574 if(!is_dup) {
575 de_ucstring *flagstr = ucstring_create(c);
576 if(pinfo->papflags&0x06) {
577 ucstring_append_flags_item(flagstr, (pinfo->papflags&0x01)?"footer":"header");
578 ucstring_append_flags_item(flagstr, (pinfo->papflags&0x08)?"print on first page":
579 "do not print on first page");
581 if(pinfo->papflags&0x10) ucstring_append_flags_item(flagstr, "picture");
582 de_dbg(c, "paragraph flags: 0x%02x (%s)", (unsigned int)pinfo->papflags,
583 ucstring_getpsz(flagstr));
584 ucstring_destroy(flagstr);
589 static void do_para_info_page(deark *c, lctx *d, i64 pos)
591 i64 fcFirst;
592 i64 cfod;
593 i64 i;
594 i64 fod_array_startpos;
595 i64 prevtextpos;
596 u8 fprop_seen[128];
598 de_zeromem(fprop_seen, sizeof(fprop_seen));
599 de_dbg(c, "paragraph info page at %d", (int)pos);
600 de_dbg_indent(c, 1);
602 cfod = (i64)de_getbyte(pos+127);
603 de_dbg(c, "number of FODs on this page: %d", (int)cfod);
605 // There are up to 123 bytes available for the FOD array, and each FOD is
606 // 6 bytes. So I assume the maximum possible is 20.
607 if(cfod>20) cfod=20;
609 fcFirst = de_getu32le(pos);
610 de_dbg(c, "fcFirst: %d", (int)fcFirst);
612 fod_array_startpos = pos + 4;
614 prevtextpos = fcFirst;
616 for(i=0; i<cfod; i++) {
617 struct para_info *pinfo = NULL;
618 i64 fcLim_orig, fcLim_adj;
619 i64 bfprop;
620 i64 fodpos = fod_array_startpos + 6*i;
622 pinfo = de_malloc(c, sizeof(struct para_info));
624 de_dbg(c, "FOD[%d] at %d", (int)i, (int)fodpos);
625 de_dbg_indent(c, 1);
627 fcLim_orig = de_getu32le(fodpos);
628 fcLim_adj = fcLim_orig;
629 if(fcLim_adj > d->fcMac) fcLim_adj = d->fcMac;
630 pinfo->thisparapos = prevtextpos;
631 pinfo->thisparalen = fcLim_adj - prevtextpos;
632 de_dbg(c, "fcLim: %d (paragraph from %d to %d)", (int)fcLim_orig,
633 (int)pinfo->thisparapos, (int)(fcLim_adj-1));
634 prevtextpos = fcLim_adj;
636 bfprop = de_getu16le(fodpos+4);
637 if(bfprop==0xffff) {
638 de_dbg(c, "bfprop: %d (none)", (int)bfprop);
640 else {
641 pinfo->bfprop_offset = fod_array_startpos + bfprop;
643 de_dbg(c, "bfprop: %d (+ %d = %d)", (int)bfprop,
644 (int)fod_array_startpos, (int)pinfo->bfprop_offset);
646 de_dbg_indent(c, 1);
647 if(bfprop<128) {
648 if(fprop_seen[bfprop]) {
649 // An FPROP can be referenced multiple times. Only print the
650 // debug info for it once.
651 de_dbg(c, "[already decoded FPROP at %d on this paragraph info page]", (int)bfprop);
653 do_para_fprop(c, d, pinfo, bfprop, fprop_seen[bfprop]);
654 fprop_seen[bfprop] = 1;
656 de_dbg_indent(c, -1);
659 do_paragraph(c, d, pinfo);
661 de_free(c, pinfo);
662 pinfo = NULL;
663 de_dbg_indent(c, -1);
666 de_dbg_indent(c, -1);
669 static void do_para_info(deark *c, lctx *d)
671 i64 i;
673 if(d->pnPara_npages<1) return;
674 de_dbg(c, "paragraph info at %d, len=%d page(s)", (int)d->pnPara_offs, (int)d->pnPara_npages);
676 de_dbg_indent(c, 1);
677 for(i=0; i<d->pnPara_npages; i++) {
678 do_para_info_page(c, d, d->pnPara_offs + 128*i);
680 de_dbg_indent(c, -1);
683 static void do_html_begin(deark *c, lctx *d)
685 dbuf *f;
686 if(d->html_outf) return;
687 d->html_outf = dbuf_create_output_file(c, "html", NULL, 0);
688 dbuf_enable_wbuffer(d->html_outf);
689 f = d->html_outf;
690 if(c->write_bom && !c->ascii_html) dbuf_write_uchar_as_utf8(f, 0xfeff);
691 dbuf_puts(f, "<!DOCTYPE html>\n");
692 dbuf_puts(f, "<html>\n");
693 dbuf_puts(f, "<head>\n");
694 dbuf_printf(f, "<meta charset=\"%s\">\n", c->ascii_html?"US-ASCII":"UTF-8");
695 dbuf_puts(f, "<title></title>\n");
697 dbuf_puts(f, "<style type=\"text/css\">\n");
698 dbuf_puts(f, " body { color: #000; background-color: #fff }\n");
699 dbuf_puts(f, " p { margin-top: 0; margin-bottom: 0 }\n");
700 dbuf_puts(f, " .c { color: #ccc }\n"); // Visible control characters
702 // Replacement object
703 dbuf_puts(f, " .r { padding: 0.5ex; color: #800; background-color: #eee;\n");
704 dbuf_puts(f, " font-style: italic; border: 0.34ex dotted #800 }\n");
706 dbuf_puts(f, " .tc { text-align: center }\n");
707 dbuf_puts(f, " .tr { text-align: right }\n");
708 dbuf_puts(f, " .tj { text-align: justify }\n");
709 dbuf_puts(f, "</style>\n");
711 dbuf_puts(f, "</head>\n");
712 dbuf_puts(f, "<body>\n");
715 static void do_html_end(deark *c, lctx *d)
717 if(!d->html_outf) return;
718 dbuf_puts(d->html_outf, "</body>\n</html>\n");
719 dbuf_close(d->html_outf);
720 d->html_outf = NULL;
723 static void de_run_wri(deark *c, de_module_params *mparams)
725 lctx *d = NULL;
726 i64 pos;
728 d = de_malloc(c, sizeof(lctx));
730 d->input_encoding = de_get_input_encoding(c, NULL, DE_ENCODING_WINDOWS1252);
731 d->extract_text = de_get_ext_option_bool(c, "wri:extracttext", 1);
732 d->extract_ole = de_get_ext_option_bool(c, "wri:extractole",
733 (c->extract_level>=2)?1:0);
735 de_encconv_init(&d->es, d->input_encoding);
737 d->tmpstr = ucstring_create(c);
739 pos = 0;
740 if(!do_header(c, d, pos)) goto done;
741 if(d->extract_text) {
742 do_html_begin(c, d);
745 do_para_info(c, d);
747 done:
748 if(d) {
749 do_html_end(c, d);
750 ucstring_destroy(d->tmpstr);
751 de_free(c, d);
755 static int de_identify_wri(deark *c)
757 u8 buf[6];
758 de_read(buf, 0, 6);
760 if((buf[0]==0x31 || buf[0]==0x32) &&
761 !de_memcmp(&buf[1], "\xbe\x00\x00\x00\xab", 5))
763 i64 pnMac;
764 pnMac = de_getu16le(48*2);
765 if(pnMac==0) return 0; // Apparently MSWord, not Write
766 return 100;
768 return 0;
771 static void de_help_wri(deark *c)
773 de_msg(c, "-opt wri:extracttext=0 : Do not extract text");
774 de_msg(c, "-opt wri:extractole : Extract unidentified OLE objects");
777 void de_module_wri(deark *c, struct deark_module_info *mi)
779 mi->id = "wri";
780 mi->desc = "Microsoft Write";
781 mi->run_fn = de_run_wri;
782 mi->identify_fn = de_identify_wri;
783 mi->help_fn = de_help_wri;