Thumbnail file hits. Based on a patch from D Bera
[beagle.git] / glue / wv1-glue.c
blob9bf0fb9266fd67f439bb370e0a1a6965787ce2cc
1 /*
2 * wv1-glue.c : A "C" wrapper for using wv1 (library to parse
3 * Microsoft Word documents).
5 * Copyright (C) 2004 Novell, Inc.
7 * Author: Veerapuram Varadhan <vvaradhan@novell.com>
8 * [Basic framework of this file is taken from wvRTF.c of wv-1.0]
13 * Permission is hereby granted, free of charge, to any person obtaining a
14 * copy of this software and associated documentation files (the "Software"),
15 * to deal in the Software without restriction, including without limitation
16 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 * and/or sell copies of the Software, and to permit persons to whom the
18 * Software is furnished to do so, subject to the following conditions:
20 * The above copyright notice and this permission notice shall be included in
21 * all copies or substantial portions of the Software.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 * DEALINGS IN THE SOFTWARE.
32 #include <wv.h>
33 #include <string.h>
/* Number of structurally-broken text chunks to hold in the text/hot
 * pools before sending them for indexing.  Increasing the number
 * improves indexing performance, but it may take a large chunk of
 * memory to hold that much data, depending on the length of each
 * structurally-broken line.
 */
43 #define BUFFERED_STRUCT_BREAK 12
46 /* Callback to Handle "text" (or words) extracted out of
47 * M$ Word documents
49 * text: Holds the extracted text/words.
51 * hotText: Identifies the attributes of the text.
52 * (bold, italic, underline, superscript, subscript)
55 typedef void (* wvTextHandlerCallback) (U8* text, int len,
56 U8* hotText, int hotLen,
57 U8 needStructBrk);
59 typedef struct _UserData {
60 /* formatting variables */
62 int cFontSize;
63 int cCol;
65 /* boolean formats */
66 int bIsBold:1;
67 int bIsItalic:1;
68 int bIsStrike:1;
69 int bIsUl:1;
70 int bIsSup:1;
71 int bIsSub:1;
72 int bIsSplStyle:1;
73 int bIgnore:1;
75 /* beagle specifc formats */
76 U8 bIsHot;
78 /* beagle specifc formats - for partially formatted
79 * texts.
81 U8 bWasHot;
83 /* buffer to hold text */
84 GString* txtWord;
86 /* buffer to hold hot-pool-text */
87 GString* txtHotPool;
89 /* buffer to hold normal-pool-text */
90 GString* txtPool;
92 /* hold number of "structural breaks" encountered
93 * since last-update-to-filter.
95 short structBrkCount;
97 wvTextHandlerCallback WordHandler;
99 } UserData;
103 * append_char: fills the txtWord buffer with the character 'ch'
104 * converted to UTF8 encoding. Calls the "WordHandler" for every
105 * word/line/end of a paragraph or for every 1023 characters,
106 * whichever comes first.
108 * ud : carries the UserData filled-in appropriately to hold the
109 * character (text) attributes.
111 * ch : unicode character
115 void
116 append_char (UserData * ud, U16 ch)
118 gchar tmpBuf[64];
119 int len = 0;
120 U8 bNeedStructBrk = 0;
122 if (ud->bIgnore)
123 return;
125 switch (ch) {
126 case 0x0B: /* hard line break */
127 case 0x0D: /* paragraph end */
128 case 0x0C:
129 case '\n': /* new-line */
130 bNeedStructBrk = 1;
131 ch = 0x00;
132 break;
134 case 0x20: /* space */
135 g_string_append_c (ud->txtWord, ch);
136 break;
137 default:
138 len = g_unichar_to_utf8 (ch, tmpBuf);
139 int i;
140 /* FIXME: This is not good, pretty hacky code
141 * to get rid of unwanted characters, especially
142 * some graphic symbols used in a document.
143 * Ex: a tick mark, a smiley blah blah blah...
144 * in a much sane way without blocking
145 * printable-non-iso characters ;)
148 for (i = 0; i < len; i++)
149 if (tmpBuf[i] > 0)
150 g_string_append_c (ud->txtWord, tmpBuf[i]);
152 g_string_append_len (ud->txtWord, tmpBuf, len);
153 break;
156 if (ch == 0x00 || ch == 0x20) {
157 if (ud->bWasHot)
158 g_string_append (ud->txtHotPool, ud->txtWord->str);
160 g_string_append (ud->txtPool, ud->txtWord->str);
162 /* printf ("TxtWord: %s\n", ud->txtWord->str);
163 printf ("TxtPool: %s\n", ud->txtPool->str);
164 printf ("HotTxtPool: %s\n", ud->txtHotPool->str);
167 if (bNeedStructBrk) {
168 g_string_append_c (ud->txtPool, '\n');
169 g_string_append_c (ud->txtHotPool, ' ');
170 ud->structBrkCount++;
173 if (ud->structBrkCount >= BUFFERED_STRUCT_BREAK) {
174 (*(ud->WordHandler))(ud->txtPool->str, ud->txtPool->len,
175 ud->txtHotPool->str, ud->txtHotPool->len, bNeedStructBrk);
176 g_string_erase (ud->txtPool, 0, -1);
177 g_string_erase (ud->txtHotPool, 0, -1);
178 ud->structBrkCount = 0;
180 g_string_erase (ud->txtWord, 0, -1);
181 ud->bWasHot = 0;
186 * fill_UserData: fills the UserData structure from the
187 * CHP structure that represents the Character Property
188 * Information like bold, italic, striked, underlined,
189 * superscript, subscript, fontsize, color, fontface etc.
192 void
193 fill_UserData (UserData * ud, CHP * chp, wvParseStruct * ps)
195 ud->cCol = 0;
196 if (chp->ico)
197 ud->cCol = chp->ico - 1;
199 ud->cFontSize = chp->hps;
200 ud->bIsBold = (chp->fBold);
201 ud->bIsItalic = (chp->fItalic);
202 ud->bIsUl = (chp->kul);
203 ud->bIsStrike = (chp->fStrike);
204 ud->bIsSup = (chp->iss == 1);
205 ud->bIsSub = (chp->iss == 2);
207 if ((ud->bIsBold
208 || ud->bIsItalic
209 || ud->bIsUl
210 || ud->bIsSup
211 || ud->bIsSub
212 || ud->bIsSplStyle) &&
213 (!ud->bIgnore))
214 ud->bIsHot = 1;
215 else
216 ud->bIsHot = 0;
219 /* This is a callback that handles the individual
220 * character that are extracted from M$ word file.
222 static int
223 charProc (wvParseStruct * ps, U16 eachchar, U8 chartype, U16 lid)
226 /* convert incoming character to unicode */
227 if (chartype) {
228 eachchar = wvHandleCodePage (eachchar, lid);
231 /* take care of any oddities in Microsoft's character "encoding" */
232 /* TODO: does the above code page handler take care of these? */
233 if (chartype == 1 && eachchar == 146)
234 eachchar = 39; /* apostrophe */
236 switch (eachchar)
238 case 14: /* column break */
239 break;
241 case 19: /* field begin */
242 /* flush current text buffer */
243 ps->fieldstate++;
244 ps->fieldmiddle = 0;
245 return 0;
246 case 20: /* field separator */
247 ps->fieldmiddle = 1;
248 return 0;
249 case 21: /* field end */
250 ps->fieldstate--;
251 ps->fieldmiddle = 0;
252 return 0;
253 case 7: /* Cell/Row mark, end of a cell/row*/
254 eachchar = 0x20;
255 break;
256 default:
257 break;
260 if (eachchar == 0x14)
261 return 0;
263 /* To handle partially-formatted-texts, Bug#157100,
264 * which is applicable to all word-processor-generated
265 * documents.
267 * ud->bIsHot is updated for every CHARPROPBEGIN element
268 * ud->bWasHot is updated on reading every *word*.
270 UserData *ud = (UserData *) ps->userData;
271 if (!ud->bWasHot)
272 ud->bWasHot = ud->bIsHot;
274 append_char (ps->userData, eachchar);
275 return 0;
278 /* This is a callback that handles the special
279 * character that are specific to M$ word file.
281 static int
282 specCharProc (wvParseStruct * ps, U16 eachchar, CHP * achp)
284 Blip blip;
285 wvStream *fil;
286 long pos;
287 FSPA *fspa;
288 PICF picf;
289 FDOA *fdoa;
291 switch (eachchar)
293 case 19: /* field begin */
294 ps->fieldstate++;
295 ps->fieldmiddle = 0;
296 return 0;
297 case 20: /* field separator */
298 if (achp->fOle2)
300 /* printf ("Field has an embedded OLE2 object\n"); */
302 ps->fieldmiddle = 1;
303 return 0;
304 case 21: /* field end */
305 ps->fieldstate--;
306 ps->fieldmiddle = 0;
307 return 0;
308 case 7: /* Cell/Row mark, end of a cell/row */
309 append_char (ps->userData, 0x20);
310 break;
311 default:
312 break;
315 if (ps->fieldstate)
317 if (eachchar == 0x13 || eachchar == 0x14)
318 return 0;
321 return 0;
324 /* This is a callback that handles the individual
325 * elements that are marked by libwv1.
328 static int
329 eleProc (wvParseStruct * ps, wvTag tag, void *props, int dirty)
331 /* some word structures */
332 PAP *apap;
333 CHP *achp;
334 SEP *asep;
335 int iRes;
337 UserData *ud = (UserData *) ps->userData;
339 switch (tag)
341 case PARABEGIN:
342 apap = (PAP *)props;
343 switch (ps->stsh.std[apap->istd].sti) {
344 case 29: /* Footnote Text */
345 case 30: /* Annotation text */
346 case 31: /* Header */
347 case 32: /* Footer */
348 case 33: /* Index Heading */
349 case 34: /* Caption */
350 case 43: /* Endnote Text */
351 case 62: /* Title */
352 case 74: /* Sub title */
353 ud->bIsSplStyle = 1;
354 break;
355 default:
356 ud->bIsSplStyle = 0;
357 break;
359 break;
361 case SECTIONEND:
362 append_char (ud, '\n');
363 break;
365 case PARAEND: /* pretty much nothing */
366 ud->bIsSplStyle = 0;
367 append_char (ud, '\n');
368 break;
370 case CHARPROPBEGIN:
371 achp = (CHP *) props;
372 /* switch (ps->stsh.std[achp->istd].sti) {
373 case 38:
374 case 39:
375 case 40:
376 case 41:
377 case 42:
378 ud->bIgnore = 1;
379 break;
380 default:
381 ud->bIgnore = 0;
382 break;
385 fill_UserData (ud, achp, ps);
386 break;
389 /* Do not call fill_UserData, as it resets the
390 * *Hot* flag in the ud structure.
392 case CHARPROPEND:
393 achp = (CHP *) props;
394 /*fill_UserData (ud, achp, ps);*/
395 break;
397 default:
398 break;
401 return 0;
404 /* This is a callback that handles the document
405 * level tags that are marked by libwv1.
408 static int
409 docProc (wvParseStruct * ps, wvTag tag)
411 UserData *ud = (UserData *) ps->userData;
412 switch (tag)
414 case DOCEND:
415 /* flush the text/hot pools at the EOD */
416 ud->structBrkCount = BUFFERED_STRUCT_BREAK;
417 append_char (ps->userData, 0x00);
418 break;
420 default:
421 break;
424 return 0;
/*
 * wv1_init (): Initialize the wv1 library
 * NOTE: Do not call this more than once for an application.
 *
 * Return: the value returned by wvInit ().
 */
int
wv1_init (void)
{
  return wvInit ();
}
440 * wv1_glue_init_doc_parsing: Initiates the document parsing
441 * procedure. Sets up all the required handlers and the parser.
443 * fname: Name of the file to parse. (essentially a M$ word file)
445 * wvTextHandlerCallback: The callback routine that will be called
446 * on extraction of each word.
448 * Return: 0 -> success
449 * -1 -> failure.
453 wv1_glue_init_doc_parsing (char* fname, wvTextHandlerCallback callback)
455 FILE *input;
456 int ret = 0;
458 wvParseStruct ps;
459 char *dir = NULL;
461 UserData ud;
463 input = fopen (fname, "rb");
464 if (!input)
465 return -1;
466 fclose (input);
468 ret = wvInitParser (&ps, fname);
469 if (ret & 0x8000)
470 ret = -2;
471 else if (ret)
472 ret = -3;
474 if (ret) {
475 wvOLEFree (&ps);
476 return ret;
479 ps.filename = fname;
480 ps.dir = dir;
482 /* set to 0 */
483 memset (&ud, 0, sizeof (UserData));
484 ud.WordHandler = callback;
485 ud.txtWord = g_string_sized_new (32);
486 ud.txtHotPool = g_string_sized_new (1024);
487 ud.txtPool = g_string_sized_new (1024);
488 ps.userData = &ud;
490 wvSetElementHandler (&ps, eleProc);
491 wvSetDocumentHandler (&ps, docProc);
492 wvSetCharHandler (&ps, charProc);
493 wvSetSpecialCharHandler (&ps, specCharProc);
495 wvText (&ps);
497 /* free associated memory */
498 wvOLEFree (&ps);
500 /* free userdata memory */
501 g_string_free (ud.txtWord, TRUE);
503 /* free text pool memory */
504 g_string_free (ud.txtPool, TRUE);
506 /* free hot text pool memory */
507 g_string_free (ud.txtHotPool, TRUE);
509 return 0;
512 void *
513 wv1_glue_get_ole_stream (const char* fname)
515 MsOle *ole = NULL;
516 ms_ole_open (&ole, fname);
517 return ((void *)ole);
520 void *
521 wv1_glue_get_ole_summary_stream (MsOle *stream)
523 MsOle *oleStream = (MsOle *)stream;
524 MsOleSummary *summary = NULL;
525 summary = ms_ole_summary_open (oleStream);
526 return ((void *)summary);
529 char *
530 wv1_glue_get_title (MsOleSummary* smryStream)
532 int ret;
533 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_TITLE, &ret));
536 char *
537 wv1_glue_get_subject (MsOleSummary* smryStream)
539 int ret;
540 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_SUBJECT, &ret));
543 char *
544 wv1_glue_get_author (MsOleSummary* smryStream)
546 int ret;
547 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_AUTHOR, &ret));
550 char *
551 wv1_glue_get_keywords (MsOleSummary* smryStream)
553 int ret;
554 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_KEYWORDS, &ret));
557 char *
558 wv1_glue_get_comments (MsOleSummary* smryStream)
560 int ret;
561 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_COMMENTS, &ret));
564 char *
565 wv1_glue_get_template (MsOleSummary* smryStream)
567 int ret;
568 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_TEMPLATE, &ret));
571 char *
572 wv1_glue_get_lastsavedby (MsOleSummary* smryStream)
574 int ret;
575 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_LASTAUTHOR, &ret));
578 char *
579 wv1_glue_get_revision_number (MsOleSummary* smryStream)
581 int ret;
582 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_REVNUMBER, &ret));
585 char *
586 wv1_glue_get_appname (MsOleSummary* smryStream)
588 int ret;
589 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_APPNAME, &ret));
592 long
593 wv1_glue_get_page_count (MsOleSummary* smryStream)
595 int ret;
596 return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_PAGECOUNT, &ret));
599 long
600 wv1_glue_get_word_count (MsOleSummary* smryStream)
602 int ret;
603 return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_WORDCOUNT, &ret));
606 long
607 wv1_glue_get_character_count (MsOleSummary* smryStream)
609 int ret;
610 return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_CHARCOUNT, &ret));
613 long
614 wv1_glue_get_security (MsOleSummary* smryStream)
616 int ret;
617 return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_SECURITY, &ret));
620 short
621 wv1_glue_get_codepage (MsOleSummary* smryStream)
623 int ret;
624 return (ms_ole_summary_get_short (smryStream, MS_OLE_SUMMARY_CODEPAGE, &ret));
627 void
628 wv1_glue_close_stream (MsOle* oleStream, MsOleSummary* summary)
630 ms_ole_summary_close (summary);
631 ms_ole_destroy (&oleStream);