2 * wv1-glue.c : A "C" wrapper for using wv1 (library to parse
3 * Microsoft Word documents).
5 * Copyright (C) 2004 Novell, Inc.
7 * Author: Veerapuram Varadhan <vvaradhan@novell.com>
8 * [Basic framework of this file is taken from wvRTF.c of wv-1.0]
13 * Permission is hereby granted, free of charge, to any person obtaining a
14 * copy of this software and associated documentation files (the "Software"),
15 * to deal in the Software without restriction, including without limitation
16 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 * and/or sell copies of the Software, and to permit persons to whom the
18 * Software is furnished to do so, subject to the following conditions:
20 * The above copyright notice and this permission notice shall be included in
21 * all copies or substantial portions of the Software.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 * DEALINGS IN THE SOFTWARE.
35 /* Number of structural-break'ed text-chunks to hold
36 * in the text/hot pools, before sending them for
37 * indexing. Increasing the number gives better
38 * indexing performance; however, it may take a
39 * large chunk of memory to hold that much data and
40 * depends on the length of each structurally-broken
43 #define BUFFERED_STRUCT_BREAK 12
46 /* Callback to Handle "text" (or words) extracted out of
49 * text: Holds the extracted text/words.
51 * hotText: Identifies the attributes of the text.
52 * (bold, italic, underline, superscript, subscript)
55 typedef void (* wvTextHandlerCallback
) (U8
* text
, int len
,
56 U8
* hotText
, int hotLen
,
59 typedef struct _UserData
{
60 /* formatting variables */
75 /* beagle specific formats */
78 /* beagle specific formats - for partially formatted
84 specifies end of para, used to send data to managed code
88 /* buffer to hold text */
91 /* buffer to hold hot-pool-text */
94 /* buffer to hold normal-pool-text */
97 /* hold number of "structural breaks" encountered
98 * since last-update-to-filter.
100 short structBrkCount
;
102 wvTextHandlerCallback WordHandler
;
108 * append_char: fills the txtWord buffer with the character 'ch'
109 * converted to UTF8 encoding. Calls the "WordHandler" for every
110 * word/line/end of a paragraph or for every 1023 characters,
111 * whichever comes first.
113 * ud : carries the UserData filled-in appropriately to hold the
114 * character (text) attributes.
116 * ch : unicode character
121 append_char (UserData
* ud
, U16 ch
)
125 U8 bNeedStructBrk
= 0;
131 case 0x00: /* End of Document */
135 case 0x0B: /* hard line break */
136 case 0x0D: /* paragraph end */
138 case '\n': /* new-line */
143 case 0x20: /* space */
144 g_string_append_c (ud
->txtWord
, ch
);
148 len
= g_unichar_to_utf8 (ch
, tmpBuf
);
149 /* FIXME: This is not good, pretty hacky code
150 * to get rid of unwanted characters, especially
151 * some graphic symbols used in a document.
152 * Ex: a tick mark, a smiley, etc. This should be done
153 * in a much saner way without blocking
154 * printable-non-iso characters ;)
158 for (i = 0; i < len; i++)
160 g_string_append_c (ud->txtWord, tmpBuf[i]);
162 g_string_append_len (ud
->txtWord
, tmpBuf
, len
);
166 if (ch
== 0x00 || ch
== 0x20 || ch
== 0x0A) {
168 g_string_append_len (ud
->txtHotPool
, ud
->txtWord
->str
, ud
->txtWord
->len
);
171 printf ("TxtWord: %s, len: %d\n", ud->txtWord->str, ud->txtWord->len);
172 printf ("TxtPool: %s, len: %d\n", ud->txtPool->str, ud->txtPool->len);
173 printf ("HotTxtPool: %s, len: %d\n", ud->txtHotPool->str, ud->txtHotPool->len);
176 g_string_append_len (ud
->txtPool
, ud
->txtWord
->str
, ud
->txtWord
->len
);
177 if (bNeedStructBrk
) {
178 g_string_append_c (ud
->txtPool
, '\n');
179 g_string_append_c (ud
->txtHotPool
, ' ');
180 ud
->structBrkCount
++;
183 if (ud
->structBrkCount
>= BUFFERED_STRUCT_BREAK
||
185 (*(ud
->WordHandler
))(ud
->txtPool
->str
, ud
->txtPool
->len
,
186 ud
->txtHotPool
->str
, ud
->txtHotPool
->len
, bNeedStructBrk
);
188 g_string_erase () could be used here to erase
189 the previous content; however, using that
190 call will free the "erased-content-memory",
191 thereby causing memory fragmentation
192 every time we transfer data from unmanaged
193 to managed code. Setting "len" to 0 has the
194 same effect as g_string_erase (), but
195 doesn't do memory-[de/re]allocation.
199 ch == 0x00 refers to EOD. Do not reset len to
200 zero, we have to free the gstrings.
203 ud
->txtPool
->len
= 0;
204 ud
->txtHotPool
->len
= 0;
205 ud
->structBrkCount
= 0;
209 ud
->txtWord
->len
= 0;
215 * fill_UserData: fills the UserData structure from the
216 * CHP structure that represents the Character Property
217 * Information like bold, italic, striked, underlined,
218 * superscript, subscript, fontsize, color, fontface etc.
222 fill_UserData (UserData
* ud
, CHP
* chp
, wvParseStruct
* ps
)
229 ud
->cCol
= chp
->ico
- 1;
231 ud
->cFontSize
= chp
->hps
;
232 ud
->bIsBold
= (chp
->fBold
);
233 ud
->bIsItalic
= (chp
->fItalic
);
234 ud
->bIsUl
= (chp
->kul
);
235 ud
->bIsStrike
= (chp
->fStrike
);
236 ud
->bIsSup
= (chp
->iss
== 1);
237 ud
->bIsSub
= (chp
->iss
== 2);
244 || ud
->bIsSplStyle
) &&
251 /* This is a callback that handles the individual
252 * characters that are extracted from an MS Word file.
255 charProc (wvParseStruct
* ps
, U16 eachchar
, U8 chartype
, U16 lid
)
257 /* convert incoming character to unicode */
259 eachchar
= wvHandleCodePage (eachchar
, lid
);
262 /* take care of any oddities in Microsoft's character "encoding" */
263 /* TODO: does the above code page handler take care of these? */
264 if (chartype
== 1 && eachchar
== 146)
265 eachchar
= 39; /* apostrophe */
269 case 14: /* column break */
272 case 19: /* field begin */
273 /* flush current text buffer */
277 case 20: /* field separator */
280 case 21: /* field end */
284 case 7: /* Cell/Row mark, end of a cell/row*/
291 if (eachchar
== 0x14)
294 /* To handle partially-formatted-texts, Bug#157100,
295 * which is applicable to all word-processor-generated
298 * ud->bIsHot is updated for every CHARPROPBEGIN element
299 * ud->bWasHot is updated on reading every *word*.
301 UserData
*ud
= (UserData
*) ps
->userData
;
303 ud
->bWasHot
= ud
->bIsHot
;
305 append_char (ps
->userData
, eachchar
);
309 /* This is a callback that handles the special
310 * characters that are specific to an MS Word file.
313 specCharProc (wvParseStruct
* ps
, U16 eachchar
, CHP
* achp
)
324 case 19: /* field begin */
328 case 20: /* field separator */
331 /* printf ("Field has an embedded OLE2 object\n"); */
335 case 21: /* field end */
339 case 7: /* Cell/Row mark, end of a cell/row */
340 append_char (ps
->userData
, 0x20);
348 if (eachchar
== 0x13 || eachchar
== 0x14)
355 /* This is a callback that handles the individual
356 * elements that are marked by libwv1.
360 eleProc (wvParseStruct
* ps
, wvTag tag
, void *props
, int dirty
)
362 /* some word structures */
368 UserData
*ud
= (UserData
*) ps
->userData
;
374 switch (ps
->stsh
.std
[apap
->istd
].sti
) {
375 case 29: /* Footnote Text */
376 case 30: /* Annotation text */
377 case 31: /* Header */
378 case 32: /* Footer */
379 case 33: /* Index Heading */
380 case 34: /* Caption */
381 case 43: /* Endnote Text */
383 case 74: /* Sub title */
394 append_char (ud
, '\n');
397 case PARAEND
: /* pretty much nothing */
400 append_char (ud
, '\n');
404 achp
= (CHP
*) props
;
405 /* switch (ps->stsh.std[achp->istd].sti) {
418 fill_UserData (ud
, achp
, ps
);
422 /* Do not call fill_UserData, as it resets the
423 * *Hot* flag in the ud structure.
426 achp
= (CHP
*) props
;
427 /*fill_UserData (ud, achp, ps);*/
437 /* This is a callback that handles the document
438 * level tags that are marked by libwv1.
442 docProc (wvParseStruct
* ps
, wvTag tag
)
444 UserData
*ud
= (UserData
*) ps
->userData
;
449 /* flush the text/hot pools at the EOD */
450 ud
->structBrkCount
= BUFFERED_STRUCT_BREAK
;
451 append_char (ps
->userData
, 0x00);
463 * wv1_glue_init_doc_parsing: Initiates the document parsing
464 * procedure. Sets up all the required handlers and the parser.
466 * fname: Name of the file to parse. (essentially a M$ word file)
468 * wvTextHandlerCallback: The callback routine that will be called
469 * on extraction of each word.
471 * Return: 0 -> success
476 wv1_glue_init_doc_parsing (char* fname
, wvTextHandlerCallback callback
)
486 input
= fopen (fname
, "rb");
491 ret
= wvInitParser (&ps
, fname
);
506 memset (&ud
, 0, sizeof (UserData
));
507 ud
.WordHandler
= callback
;
508 ud
.txtWord
= g_string_sized_new (32);
509 ud
.txtHotPool
= g_string_sized_new (1024);
510 ud
.txtPool
= g_string_sized_new (1024);
513 wvSetElementHandler (&ps
, eleProc
);
514 wvSetDocumentHandler (&ps
, docProc
);
515 wvSetCharHandler (&ps
, charProc
);
516 wvSetSpecialCharHandler (&ps
, specCharProc
);
520 /* free userdata memory */
521 g_string_free (ud
.txtWord
, TRUE
);
523 /* free text pool memory */
524 g_string_free (ud
.txtPool
, TRUE
);
526 /* free hot text pool memory */
527 g_string_free (ud
.txtHotPool
, TRUE
);
529 /* free associated memory */
534 ud
.txtHotPool
= NULL
;
540 * wv1_init (): Initialize the wv1 library
541 * NOTE: Do not call this more than once for an application.