Cleanup child indexables in beagle-extract-content. Print timestamp with timezone...
[beagle.git] / glue / wv1-glue.c
blob5e3cd2659736b66e736a74a4bc8deb822f42d987
1 /*
2 * wv1-glue.c : A "C" wrapper for using wv1 (library to parse
3 * Microsoft Word documents).
5 * Copyright (C) 2004 Novell, Inc.
7 * Author: Veerapuram Varadhan <vvaradhan@novell.com>
8 * [Basic framework of this file is taken from wvRTF.c of wv-1.0]
13 * Permission is hereby granted, free of charge, to any person obtaining a
14 * copy of this software and associated documentation files (the "Software"),
15 * to deal in the Software without restriction, including without limitation
16 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 * and/or sell copies of the Software, and to permit persons to whom the
18 * Software is furnished to do so, subject to the following conditions:
20 * The above copyright notice and this permission notice shall be included in
21 * all copies or substantial portions of the Software.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 * DEALINGS IN THE SOFTWARE.
32 #include <wv.h>
33 #include <string.h>
/* Number of structural-break'ed text chunks to hold in the
 * text/hot pools before sending them for indexing.  A larger
 * value gives better indexing performance, but needs more
 * memory to hold the buffered data; how much depends on the
 * length of each structurally-broken line.
 */
#define BUFFERED_STRUCT_BREAK 12
/* Callback to handle "text" (or words) extracted out of
 * M$ Word documents.
 *
 * text: Holds the extracted text/words.
 *
 * len: Length of "text", as reported by the buffer that holds it.
 *
 * hotText: Holds the words that carried "hot" attributes
 *          (bold, italic, underline, superscript, subscript).
 *
 * hotLen: Length of "hotText".
 *
 * needStructBrk: Non-zero when the character that triggered the
 *                call was a structural break (line, paragraph or
 *                section end, or end of document).
 */
typedef void (* wvTextHandlerCallback) (U8* text, int len,
					U8* hotText, int hotLen,
					U8 needStructBrk);
59 typedef struct _UserData {
60 /* formatting variables */
62 int cFontSize;
63 int cCol;
65 /* boolean formats */
66 int bIsBold:1;
67 int bIsItalic:1;
68 int bIsStrike:1;
69 int bIsUl:1;
70 int bIsSup:1;
71 int bIsSub:1;
72 int bIsSplStyle:1;
73 int bIgnore:1;
75 /* beagle specifc formats */
76 U8 bIsHot;
78 /* beagle specifc formats - for partially formatted
79 * texts.
81 U8 bWasHot:1;
84 specifies end of para, used to send data to managed code
86 int bParaEnd:1;
88 /* buffer to hold text */
89 GString* txtWord;
91 /* buffer to hold hot-pool-text */
92 GString* txtHotPool;
94 /* buffer to hold normal-pool-text */
95 GString* txtPool;
97 /* hold number of "structural breaks" encountered
98 * since last-update-to-filter.
100 short structBrkCount;
102 wvTextHandlerCallback WordHandler;
104 } UserData;
108 * append_char: fills the txtWord buffer with the character 'ch'
109 * converted to UTF8 encoding. Calls the "WordHandler" for every
110 * word/line/end of a paragraph or for every 1023 characters,
111 * whichever comes first.
113 * ud : carries the UserData filled-in appropriately to hold the
114 * character (text) attributes.
116 * ch : unicode character
120 void
121 append_char (UserData * ud, U16 ch)
123 gchar tmpBuf[64];
124 int len = 0;
125 U8 bNeedStructBrk = 0;
127 if (ud->bIgnore)
128 return;
130 switch (ch) {
131 case 0x00: /* End of Document */
132 bNeedStructBrk = 1;
133 break;
135 case 0x0B: /* hard line break */
136 case 0x0D: /* paragraph end */
137 case 0x0C:
138 case '\n': /* new-line */
139 bNeedStructBrk = 1;
140 ch = 0x0A;
141 break;
143 case 0x20: /* space */
144 g_string_append_c (ud->txtWord, ch);
145 break;
147 default:
148 len = g_unichar_to_utf8 (ch, tmpBuf);
149 /* FIXME: This is not good, pretty hacky code
150 * to get rid of unwanted characters, especially
151 * some graphic symbols used in a document.
152 * Ex: a tick mark, a smiley blah blah blah...
153 * in a much sane way without blocking
154 * printable-non-iso characters ;)
157 int i;
158 for (i = 0; i < len; i++)
159 if (tmpBuf[i] > 0)
160 g_string_append_c (ud->txtWord, tmpBuf[i]);
162 g_string_append_len (ud->txtWord, tmpBuf, len);
163 break;
166 if (ch == 0x00 || ch == 0x20 || ch == 0x0A) {
167 if (ud->bWasHot)
168 g_string_append_len (ud->txtHotPool, ud->txtWord->str, ud->txtWord->len);
171 printf ("TxtWord: %s, len: %d\n", ud->txtWord->str, ud->txtWord->len);
172 printf ("TxtPool: %s, len: %d\n", ud->txtPool->str, ud->txtPool->len);
173 printf ("HotTxtPool: %s, len: %d\n", ud->txtHotPool->str, ud->txtHotPool->len);
176 g_string_append_len (ud->txtPool, ud->txtWord->str, ud->txtWord->len);
177 if (bNeedStructBrk) {
178 g_string_append_c (ud->txtPool, '\n');
179 g_string_append_c (ud->txtHotPool, ' ');
180 ud->structBrkCount++;
183 if (ud->structBrkCount >= BUFFERED_STRUCT_BREAK ||
184 ud->bParaEnd) {
185 (*(ud->WordHandler))(ud->txtPool->str, ud->txtPool->len,
186 ud->txtHotPool->str, ud->txtHotPool->len, bNeedStructBrk);
188 g_string_erase () can be used here to erase
189 the previous content, however, using this
190 call will free the "erased-content-memory"
191 and thereby causing memory fragmentation for
192 every time we transfer data from unmanaged
193 to managed code. Setting "len" to 0 results
194 in the same way g_string_erase () does, but
195 doesn't do memory-[de/re]allocation
199 ch == 0x00 refers to EOD. Do not reset len to
200 zero, we have to free the gstrings.
202 if (ch != 0x00) {
203 ud->txtPool->len = 0;
204 ud->txtHotPool->len = 0;
205 ud->structBrkCount = 0;
208 if (ch != 0x00)
209 ud->txtWord->len = 0;
210 ud->bWasHot = 0;
215 * fill_UserData: fills the UserData structure from the
216 * CHP structure that represents the Character Property
217 * Information like bold, italic, striked, underlined,
218 * superscript, subscript, fontsize, color, fontface etc.
221 void
222 fill_UserData (UserData * ud, CHP * chp, wvParseStruct * ps)
224 if (!chp || !ud)
225 return;
227 ud->cCol = 0;
228 if (chp->ico)
229 ud->cCol = chp->ico - 1;
231 ud->cFontSize = chp->hps;
232 ud->bIsBold = (chp->fBold);
233 ud->bIsItalic = (chp->fItalic);
234 ud->bIsUl = (chp->kul);
235 ud->bIsStrike = (chp->fStrike);
236 ud->bIsSup = (chp->iss == 1);
237 ud->bIsSub = (chp->iss == 2);
239 if ((ud->bIsBold
240 || ud->bIsItalic
241 || ud->bIsUl
242 || ud->bIsSup
243 || ud->bIsSub
244 || ud->bIsSplStyle) &&
245 (!ud->bIgnore))
246 ud->bIsHot = 1;
247 else
248 ud->bIsHot = 0;
251 /* This is a callback that handles the individual
252 * character that are extracted from M$ word file.
254 static int
255 charProc (wvParseStruct * ps, U16 eachchar, U8 chartype, U16 lid)
257 /* convert incoming character to unicode */
258 if (chartype) {
259 eachchar = wvHandleCodePage (eachchar, lid);
262 /* take care of any oddities in Microsoft's character "encoding" */
263 /* TODO: does the above code page handler take care of these? */
264 if (chartype == 1 && eachchar == 146)
265 eachchar = 39; /* apostrophe */
267 switch (eachchar)
269 case 14: /* column break */
270 break;
272 case 19: /* field begin */
273 /* flush current text buffer */
274 ps->fieldstate++;
275 ps->fieldmiddle = 0;
276 return 0;
277 case 20: /* field separator */
278 ps->fieldmiddle = 1;
279 return 0;
280 case 21: /* field end */
281 ps->fieldstate--;
282 ps->fieldmiddle = 0;
283 return 0;
284 case 7: /* Cell/Row mark, end of a cell/row*/
285 eachchar = 0x20;
286 break;
287 default:
288 break;
291 if (eachchar == 0x14)
292 return 0;
294 /* To handle partially-formatted-texts, Bug#157100,
295 * which is applicable to all word-processor-generated
296 * documents.
298 * ud->bIsHot is updated for every CHARPROPBEGIN element
299 * ud->bWasHot is updated on reading every *word*.
301 UserData *ud = (UserData *) ps->userData;
302 if (!ud->bWasHot)
303 ud->bWasHot = ud->bIsHot;
305 append_char (ps->userData, eachchar);
306 return 0;
309 /* This is a callback that handles the special
310 * character that are specific to M$ word file.
312 static int
313 specCharProc (wvParseStruct * ps, U16 eachchar, CHP * achp)
315 Blip blip;
316 wvStream *fil;
317 long pos;
318 FSPA *fspa;
319 PICF picf;
320 FDOA *fdoa;
322 switch (eachchar)
324 case 19: /* field begin */
325 ps->fieldstate++;
326 ps->fieldmiddle = 0;
327 return 0;
328 case 20: /* field separator */
329 if (achp->fOle2)
331 /* printf ("Field has an embedded OLE2 object\n"); */
333 ps->fieldmiddle = 1;
334 return 0;
335 case 21: /* field end */
336 ps->fieldstate--;
337 ps->fieldmiddle = 0;
338 return 0;
339 case 7: /* Cell/Row mark, end of a cell/row */
340 append_char (ps->userData, 0x20);
341 break;
342 default:
343 break;
346 if (ps->fieldstate)
348 if (eachchar == 0x13 || eachchar == 0x14)
349 return 0;
352 return 0;
355 /* This is a callback that handles the individual
356 * elements that are marked by libwv1.
359 static int
360 eleProc (wvParseStruct * ps, wvTag tag, void *props, int dirty)
362 /* some word structures */
363 PAP *apap;
364 CHP *achp;
365 SEP *asep;
366 int iRes;
368 UserData *ud = (UserData *) ps->userData;
370 switch (tag)
372 case PARABEGIN:
373 apap = (PAP *)props;
374 switch (ps->stsh.std[apap->istd].sti) {
375 case 29: /* Footnote Text */
376 case 30: /* Annotation text */
377 case 31: /* Header */
378 case 32: /* Footer */
379 case 33: /* Index Heading */
380 case 34: /* Caption */
381 case 43: /* Endnote Text */
382 case 62: /* Title */
383 case 74: /* Sub title */
384 ud->bIsSplStyle = 1;
385 break;
386 default:
387 ud->bIsSplStyle = 0;
388 break;
390 ud->bParaEnd = 0;
391 break;
393 case SECTIONEND:
394 append_char (ud, '\n');
395 break;
397 case PARAEND: /* pretty much nothing */
398 ud->bIsSplStyle = 0;
399 ud->bParaEnd = 1;
400 append_char (ud, '\n');
401 break;
403 case CHARPROPBEGIN:
404 achp = (CHP *) props;
405 /* switch (ps->stsh.std[achp->istd].sti) {
406 case 38:
407 case 39:
408 case 40:
409 case 41:
410 case 42:
411 ud->bIgnore = 1;
412 break;
413 default:
414 ud->bIgnore = 0;
415 break;
418 fill_UserData (ud, achp, ps);
419 break;
422 /* Do not call fill_UserData, as it resets the
423 * *Hot* flag in the ud structure.
425 case CHARPROPEND:
426 achp = (CHP *) props;
427 /*fill_UserData (ud, achp, ps);*/
428 break;
430 default:
431 break;
434 return 0;
437 /* This is a callback that handles the document
438 * level tags that are marked by libwv1.
441 static int
442 docProc (wvParseStruct * ps, wvTag tag)
444 UserData *ud = (UserData *) ps->userData;
446 switch (tag)
448 case DOCEND:
449 /* flush the text/hot pools at the EOD */
450 ud->structBrkCount = BUFFERED_STRUCT_BREAK;
451 append_char (ps->userData, 0x00);
453 break;
455 default:
456 break;
459 return 0;
463 * wv1_glue_init_doc_parsing: Initiates the document parsing
464 * procedure. Sets up all the required handlers and the parser.
466 * fname: Name of the file to parse. (essentially a M$ word file)
468 * wvTextHandlerCallback: The callback routine that will be called
469 * on extraction of each word.
471 * Return: 0 -> success
472 * -1 -> failure.
476 wv1_glue_init_doc_parsing (char* fname, wvTextHandlerCallback callback)
478 FILE *input;
479 int ret = 0;
481 wvParseStruct ps;
482 char *dir = NULL;
484 UserData ud;
486 input = fopen (fname, "rb");
487 if (!input)
488 return -1;
489 fclose (input);
491 ret = wvInitParser (&ps, fname);
492 if (ret & 0x8000)
493 ret = -2;
494 else if (ret)
495 ret = -3;
497 if (ret) {
498 wvOLEFree (&ps);
499 return ret;
502 ps.filename = fname;
503 ps.dir = dir;
505 /* set to 0 */
506 memset (&ud, 0, sizeof (UserData));
507 ud.WordHandler = callback;
508 ud.txtWord = g_string_sized_new (32);
509 ud.txtHotPool = g_string_sized_new (1024);
510 ud.txtPool = g_string_sized_new (1024);
511 ps.userData = &ud;
513 wvSetElementHandler (&ps, eleProc);
514 wvSetDocumentHandler (&ps, docProc);
515 wvSetCharHandler (&ps, charProc);
516 wvSetSpecialCharHandler (&ps, specCharProc);
518 wvText (&ps);
520 /* free userdata memory */
521 g_string_free (ud.txtWord, TRUE);
523 /* free text pool memory */
524 g_string_free (ud.txtPool, TRUE);
526 /* free hot text pool memory */
527 g_string_free (ud.txtHotPool, TRUE);
529 /* free associated memory */
530 wvOLEFree (&ps);
532 ud.txtPool = NULL;
533 ud.txtWord = NULL;
534 ud.txtHotPool = NULL;
536 return 0;
/*
 * wv1_init (): Initialize the wv1 library
 * NOTE: Do not call this more than once for an application.
 *
 * Return: the value returned by wvInit ().
 */
int
wv1_init (void)
{
  return (wvInit ());
}