2 * wv1-glue.c : A "C" wrapper for using wv1 (library to parse
3 * Microsoft Word documents).
5 * Copyright (C) 2004 Novell, Inc.
7 * Author: Veerapuram Varadhan <vvaradhan@novell.com>
8 * [Basic framework of this file is taken from wvRTF.c of wv-1.0]
13 * Permission is hereby granted, free of charge, to any person obtaining a
14 * copy of this software and associated documentation files (the "Software"),
15 * to deal in the Software without restriction, including without limitation
16 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 * and/or sell copies of the Software, and to permit persons to whom the
18 * Software is furnished to do so, subject to the following conditions:
20 * The above copyright notice and this permission notice shall be included in
21 * all copies or substantial portions of the Software.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 * DEALINGS IN THE SOFTWARE.
35 /* Number of structural-break-delimited text chunks to hold
36 * in the text/hot pools before sending them for
37 * indexing. Increasing the number gives better
38 * indexing performance; however, it may take a
39 * large chunk of memory to hold that much data, which
40 * depends on the length of each structurally-broken
43 #define BUFFERED_STRUCT_BREAK 12
46 /* Callback to Handle "text" (or words) extracted out of
49 * text: Holds the extracted text/words.
51 * hotText: Identifies the attributes of the text.
52 * (bold, italic, underline, superscript, subscript)
55 typedef void (* wvTextHandlerCallback
) (U8
* text
, int len
,
56 U8
* hotText
, int hotLen
,
59 typedef struct _UserData
{
60 /* formatting variables */
75 /* beagle-specific formats */
78 /* beagle-specific formats - for partially formatted
83 /* buffer to hold text */
86 /* buffer to hold hot-pool-text */
89 /* buffer to hold normal-pool-text */
92 /* hold number of "structural breaks" encountered
93 * since last-update-to-filter.
97 wvTextHandlerCallback WordHandler
;
103 * append_char: fills the txtWord buffer with the character 'ch'
104 * converted to UTF8 encoding. Calls the "WordHandler" for every
105 * word/line/end of a paragraph or for every 1023 characters,
106 * whichever comes first.
108 * ud : carries the UserData filled-in appropriately to hold the
109 * character (text) attributes.
111 * ch : unicode character
116 append_char (UserData
* ud
, U16 ch
)
120 U8 bNeedStructBrk
= 0;
126 case 0x0B: /* hard line break */
127 case 0x0D: /* paragraph end */
129 case '\n': /* new-line */
134 case 0x20: /* space */
135 g_string_append_c (ud
->txtWord
, ch
);
138 len
= g_unichar_to_utf8 (ch
, tmpBuf
);
140 /* FIXME: This is not good; it is pretty hacky code
141 * to get rid of unwanted characters, especially
142 * some graphic symbols used in a document
143 * (e.g. a tick mark or a smiley). This should be done
144 * in a much saner way, without blocking
145 * printable non-ISO characters ;)
148 for (i = 0; i < len; i++)
150 g_string_append_c (ud->txtWord, tmpBuf[i]);
152 g_string_append_len (ud
->txtWord
, tmpBuf
, len
);
156 if (ch
== 0x00 || ch
== 0x20) {
158 g_string_append (ud
->txtHotPool
, ud
->txtWord
->str
);
160 g_string_append (ud
->txtPool
, ud
->txtWord
->str
);
162 /* printf ("TxtWord: %s\n", ud->txtWord->str);
163 printf ("TxtPool: %s\n", ud->txtPool->str);
164 printf ("HotTxtPool: %s\n", ud->txtHotPool->str);
167 if (bNeedStructBrk
) {
168 g_string_append_c (ud
->txtPool
, '\n');
169 g_string_append_c (ud
->txtHotPool
, ' ');
170 ud
->structBrkCount
++;
173 if (ud
->structBrkCount
>= BUFFERED_STRUCT_BREAK
) {
174 (*(ud
->WordHandler
))(ud
->txtPool
->str
, ud
->txtPool
->len
,
175 ud
->txtHotPool
->str
, ud
->txtHotPool
->len
, bNeedStructBrk
);
176 g_string_erase (ud
->txtPool
, 0, -1);
177 g_string_erase (ud
->txtHotPool
, 0, -1);
178 ud
->structBrkCount
= 0;
180 g_string_erase (ud
->txtWord
, 0, -1);
186 * fill_UserData: fills the UserData structure from the
187 * CHP structure that represents the Character Property
188 * Information like bold, italic, strikethrough, underlined,
189 * superscript, subscript, font size, color, font face etc.
193 fill_UserData (UserData
* ud
, CHP
* chp
, wvParseStruct
* ps
)
197 ud
->cCol
= chp
->ico
- 1;
199 ud
->cFontSize
= chp
->hps
;
200 ud
->bIsBold
= (chp
->fBold
);
201 ud
->bIsItalic
= (chp
->fItalic
);
202 ud
->bIsUl
= (chp
->kul
);
203 ud
->bIsStrike
= (chp
->fStrike
);
204 ud
->bIsSup
= (chp
->iss
== 1);
205 ud
->bIsSub
= (chp
->iss
== 2);
212 || ud
->bIsSplStyle
) &&
219 /* This is a callback that handles the individual
220 * characters that are extracted from an MS Word file.
223 charProc (wvParseStruct
* ps
, U16 eachchar
, U8 chartype
, U16 lid
)
226 /* convert incoming character to unicode */
228 eachchar
= wvHandleCodePage (eachchar
, lid
);
231 /* take care of any oddities in Microsoft's character "encoding" */
232 /* TODO: does the above code page handler take care of these? */
233 if (chartype
== 1 && eachchar
== 146)
234 eachchar
= 39; /* apostrophe */
238 case 14: /* column break */
241 case 19: /* field begin */
242 /* flush current text buffer */
246 case 20: /* field separator */
249 case 21: /* field end */
253 case 7: /* Cell/Row mark, end of a cell/row*/
260 if (eachchar
== 0x14)
263 /* To handle partially-formatted-texts, Bug#157100,
264 * which is applicable to all word-processor-generated
267 * ud->bIsHot is updated for every CHARPROPBEGIN element
268 * ud->bWasHot is updated on reading every *word*.
270 UserData
*ud
= (UserData
*) ps
->userData
;
272 ud
->bWasHot
= ud
->bIsHot
;
274 append_char (ps
->userData
, eachchar
);
278 /* This is a callback that handles the special
279 * characters that are specific to MS Word files.
282 specCharProc (wvParseStruct
* ps
, U16 eachchar
, CHP
* achp
)
293 case 19: /* field begin */
297 case 20: /* field separator */
300 /* printf ("Field has an embedded OLE2 object\n"); */
304 case 21: /* field end */
308 case 7: /* Cell/Row mark, end of a cell/row */
309 append_char (ps
->userData
, 0x20);
317 if (eachchar
== 0x13 || eachchar
== 0x14)
324 /* This is a callback that handles the individual
325 * elements that are marked by libwv1.
329 eleProc (wvParseStruct
* ps
, wvTag tag
, void *props
, int dirty
)
331 /* some word structures */
337 UserData
*ud
= (UserData
*) ps
->userData
;
343 switch (ps
->stsh
.std
[apap
->istd
].sti
) {
344 case 29: /* Footnote Text */
345 case 30: /* Annotation text */
346 case 31: /* Header */
347 case 32: /* Footer */
348 case 33: /* Index Heading */
349 case 34: /* Caption */
350 case 43: /* Endnote Text */
352 case 74: /* Sub title */
362 append_char (ud
, '\n');
365 case PARAEND
: /* pretty much nothing */
367 append_char (ud
, '\n');
371 achp
= (CHP
*) props
;
372 /* switch (ps->stsh.std[achp->istd].sti) {
385 fill_UserData (ud
, achp
, ps
);
389 /* Do not call fill_UserData, as it resets the
390 * *Hot* flag in the ud structure.
393 achp
= (CHP
*) props
;
394 /*fill_UserData (ud, achp, ps);*/
404 /* This is a callback that handles the document
405 * level tags that are marked by libwv1.
409 docProc (wvParseStruct
* ps
, wvTag tag
)
411 UserData
*ud
= (UserData
*) ps
->userData
;
415 /* flush the text/hot pools at the EOD */
416 ud
->structBrkCount
= BUFFERED_STRUCT_BREAK
;
417 append_char (ps
->userData
, 0x00);
428 * wv1_init (): Initialize the wv1 library
429 * NOTE: Do not call this more than once for an application.
440 * wv1_glue_init_doc_parsing: Initiates the document parsing
441 * procedure. Sets up all the required handlers and the parser.
443 * fname: Name of the file to parse (essentially an MS Word file).
445 * wvTextHandlerCallback: The callback routine that will be called
446 * on extraction of each word.
448 * Return: 0 -> success
453 wv1_glue_init_doc_parsing (char* fname
, wvTextHandlerCallback callback
)
463 input
= fopen (fname
, "rb");
468 ret
= wvInitParser (&ps
, fname
);
483 memset (&ud
, 0, sizeof (UserData
));
484 ud
.WordHandler
= callback
;
485 ud
.txtWord
= g_string_sized_new (32);
486 ud
.txtHotPool
= g_string_sized_new (1024);
487 ud
.txtPool
= g_string_sized_new (1024);
490 wvSetElementHandler (&ps
, eleProc
);
491 wvSetDocumentHandler (&ps
, docProc
);
492 wvSetCharHandler (&ps
, charProc
);
493 wvSetSpecialCharHandler (&ps
, specCharProc
);
497 /* free associated memory */
500 /* free userdata memory */
501 g_string_free (ud
.txtWord
, TRUE
);
503 /* free text pool memory */
504 g_string_free (ud
.txtPool
, TRUE
);
506 /* free hot text pool memory */
507 g_string_free (ud
.txtHotPool
, TRUE
);
513 wv1_glue_get_ole_stream (const char* fname
)
516 ms_ole_open (&ole
, fname
);
517 return ((void *)ole
);
521 wv1_glue_get_ole_summary_stream (MsOle
*stream
)
523 MsOle
*oleStream
= (MsOle
*)stream
;
524 MsOleSummary
*summary
= NULL
;
525 summary
= ms_ole_summary_open (oleStream
);
526 return ((void *)summary
);
530 wv1_glue_get_title (MsOleSummary
* smryStream
)
533 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_TITLE
, &ret
));
537 wv1_glue_get_subject (MsOleSummary
* smryStream
)
540 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_SUBJECT
, &ret
));
544 wv1_glue_get_author (MsOleSummary
* smryStream
)
547 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_AUTHOR
, &ret
));
551 wv1_glue_get_keywords (MsOleSummary
* smryStream
)
554 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_KEYWORDS
, &ret
));
558 wv1_glue_get_comments (MsOleSummary
* smryStream
)
561 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_COMMENTS
, &ret
));
565 wv1_glue_get_template (MsOleSummary
* smryStream
)
568 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_TEMPLATE
, &ret
));
572 wv1_glue_get_lastsavedby (MsOleSummary
* smryStream
)
575 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_LASTAUTHOR
, &ret
));
579 wv1_glue_get_revision_number (MsOleSummary
* smryStream
)
582 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_REVNUMBER
, &ret
));
586 wv1_glue_get_appname (MsOleSummary
* smryStream
)
589 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_APPNAME
, &ret
));
593 wv1_glue_get_page_count (MsOleSummary
* smryStream
)
596 return (ms_ole_summary_get_long (smryStream
, MS_OLE_SUMMARY_PAGECOUNT
, &ret
));
600 wv1_glue_get_word_count (MsOleSummary
* smryStream
)
603 return (ms_ole_summary_get_long (smryStream
, MS_OLE_SUMMARY_WORDCOUNT
, &ret
));
607 wv1_glue_get_character_count (MsOleSummary
* smryStream
)
610 return (ms_ole_summary_get_long (smryStream
, MS_OLE_SUMMARY_CHARCOUNT
, &ret
));
614 wv1_glue_get_security (MsOleSummary
* smryStream
)
617 return (ms_ole_summary_get_long (smryStream
, MS_OLE_SUMMARY_SECURITY
, &ret
));
621 wv1_glue_get_codepage (MsOleSummary
* smryStream
)
624 return (ms_ole_summary_get_short (smryStream
, MS_OLE_SUMMARY_CODEPAGE
, &ret
));
628 wv1_glue_close_stream (MsOle
* oleStream
, MsOleSummary
* summary
)
630 ms_ole_summary_close (summary
);
631 ms_ole_destroy (&oleStream
);