2 * wv1-glue.c : A "C" wrapper for using wv1 (library to parse
3 * Microsoft Word documents).
5 * Copyright (C) 2004 Novell, Inc.
7 * Author: Veerapuram Varadhan <vvaradhan@novell.com>
8 * [Basic framework of this file is taken from wvRTF.c of wv-1.0]
13 * Permission is hereby granted, free of charge, to any person obtaining a
14 * copy of this software and associated documentation files (the "Software"),
15 * to deal in the Software without restriction, including without limitation
16 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 * and/or sell copies of the Software, and to permit persons to whom the
18 * Software is furnished to do so, subject to the following conditions:
20 * The above copyright notice and this permission notice shall be included in
21 * all copies or substantial portions of the Software.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 * DEALINGS IN THE SOFTWARE.
34 /* Callback to Handle "text" (or words) extracted out of
37 * text: Holds the extracted text/words.
39 * hotText: Identifies the attributes of the text.
40 * (bold, italic, underline, superscript, subscript)
43 typedef void (* wvTextHandlerCallback
) (U8
* text
, int len
, U8 hotText
);
45 typedef struct _UserData
{
46 /* formatting variables */
59 /* beagle specific formats */
62 /* buffer to hold text */
65 wvTextHandlerCallback WordHandler
;
71 * append_char: fills the txtWord buffer with the character 'ch'
72 * converted to UTF8 encoding. Calls the "WordHandler" for every
73 * word/line/end of a paragraph or for every 1023 characters,
74 * whichever comes first.
76 * ud : carries the UserData filled-in appropriately to hold the
77 * character (text) attributes.
79 * ch : unicode character
84 append_char (UserData
* ud
, U16 ch
)
91 case 0x20: /* space */
92 case 0x0B: /* hard line break */
93 case 0x0D: /* paragraph end */
95 case '\n': /* new-line */
100 g_string_append_c (ud
->txtWord
, ch
);
103 len
= g_unichar_to_utf8 (ch
, tmpBuf
);
104 g_string_append_len (ud
->txtWord
, tmpBuf
, len
);
107 if (ch
== 0x00 || ch
== '\n' || ch
== 0x20) {
108 (*(ud
->WordHandler
))(ud
->txtWord
->str
, ud
->txtWord
->len
, ud
->bIsHot
);
109 g_string_erase (ud
->txtWord
, 0, -1);
114 * fill_UserData: fills the UserData structure from the
115 * CHP structure that represents the Character Property
116 * Information like bold, italic, striked, underlined,
117 * superscript, subscript, fontsize, color, fontface etc.
121 fill_UserData (UserData
* ud
, CHP
* chp
, wvParseStruct
* ps
)
125 ud
->cCol
= chp
->ico
- 1;
127 ud
->cFontSize
= chp
->hps
;
128 ud
->bIsBold
= (chp
->fBold
);
129 ud
->bIsItalic
= (chp
->fItalic
);
130 ud
->bIsUl
= (chp
->kul
);
131 ud
->bIsStrike
= (chp
->fStrike
);
132 ud
->bIsSup
= (chp
->iss
== 1);
133 ud
->bIsSub
= (chp
->iss
== 2);
135 if (ud
->bIsBold
|| ud
->bIsItalic
|| ud
->bIsUl
|| ud
->bIsSup
|| ud
->bIsSub
)
141 /* This is a callback that handles the individual
142 * characters that are extracted from a Microsoft Word file.
145 charProc (wvParseStruct
* ps
, U16 eachchar
, U8 chartype
, U16 lid
)
148 /* convert incoming character to unicode */
150 eachchar
= wvHandleCodePage (eachchar
, lid
);
153 /* take care of any oddities in Microsoft's character "encoding" */
154 /* TODO: does the above code page handler take care of these? */
155 if (chartype
== 1 && eachchar
== 146)
156 eachchar
= 39; /* apostrophe */
160 case 14: /* column break */
163 case 19: /* field begin */
164 /* flush current text buffer */
168 case 20: /* field separator */
171 case 21: /* field end */
180 if (eachchar
== 0x14)
183 append_char (ps
->userData
, eachchar
);
187 /* This is a callback that handles the special
188 * characters that are specific to a Microsoft Word file.
191 specCharProc (wvParseStruct
* ps
, U16 eachchar
, CHP
* achp
)
202 case 19: /* field begin */
206 case 20: /* field separator */
209 /* printf ("Field has an embedded OLE2 object\n"); */
213 case 21: /* field end */
223 if (eachchar
== 0x13 || eachchar
== 0x14)
230 /* This is a callback that handles the individual
231 * elements that are marked by libwv1.
235 eleProc (wvParseStruct
* ps
, wvTag tag
, void *props
, int dirty
)
237 /* some word structures */
243 UserData
*ud
= (UserData
*) ps
->userData
;
248 case PARAEND
: /* pretty much nothing */
249 append_char (ud
, '\n');
253 achp
= (CHP
*) props
;
254 fill_UserData (ud
, achp
, ps
);
258 achp
= (CHP
*) props
;
259 fill_UserData (ud
, achp
, ps
);
269 /* This is a callback that handles the document
270 * level tags that are marked by libwv1.
274 docProc (wvParseStruct
* ps
, wvTag tag
)
279 append_char (ps
->userData
, 0x00);
290 * wv1_glue_init_doc_parsing: Initiates the document parsing
291 * procedure. Sets up all the required handlers and the parser.
293 * fname: Name of the file to parse (essentially a Microsoft Word file).
295 * wvTextHandlerCallback: The callback routine that will be called
296 * on extraction of each word.
298 * Return: 0 -> success
303 wv1_glue_init_doc_parsing (char* fname
, wvTextHandlerCallback callback
)
313 input
= fopen (fname
, "rb");
319 ret
= wvInitParser (&ps
, fname
);
329 memset (&ud
, 0, sizeof (UserData
));
330 ud
.WordHandler
= callback
;
331 ud
.txtWord
= g_string_sized_new (32);
335 wvSetElementHandler (&ps
, eleProc
);
336 wvSetDocumentHandler (&ps
, docProc
);
337 wvSetCharHandler (&ps
, charProc
);
338 wvSetSpecialCharHandler (&ps
, specCharProc
);
342 /* free associated memory */
345 /* free userdata memory */
346 g_string_free (ud
.txtWord
, TRUE
);
352 wv1_glue_get_ole_stream (const char* fname
)
355 ms_ole_open (&ole
, fname
);
356 return ((void *)ole
);
360 wv1_glue_get_ole_summary_stream (MsOle
*stream
)
362 MsOle
*oleStream
= (MsOle
*)stream
;
363 MsOleSummary
*summary
= NULL
;
364 summary
= ms_ole_summary_open (oleStream
);
365 return ((void *)summary
);
369 wv1_glue_get_title (MsOleSummary
* smryStream
)
372 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_TITLE
, &ret
));
376 wv1_glue_get_subject (MsOleSummary
* smryStream
)
379 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_SUBJECT
, &ret
));
383 wv1_glue_get_author (MsOleSummary
* smryStream
)
386 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_AUTHOR
, &ret
));
390 wv1_glue_get_keywords (MsOleSummary
* smryStream
)
393 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_KEYWORDS
, &ret
));
397 wv1_glue_get_comments (MsOleSummary
* smryStream
)
400 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_COMMENTS
, &ret
));
404 wv1_glue_get_template (MsOleSummary
* smryStream
)
407 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_TEMPLATE
, &ret
));
411 wv1_glue_get_lastsavedby (MsOleSummary
* smryStream
)
414 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_LASTAUTHOR
, &ret
));
418 wv1_glue_get_revision_number (MsOleSummary
* smryStream
)
421 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_REVNUMBER
, &ret
));
425 wv1_glue_get_appname (MsOleSummary
* smryStream
)
428 return (ms_ole_summary_get_string (smryStream
, MS_OLE_SUMMARY_APPNAME
, &ret
));
432 wv1_glue_get_page_count (MsOleSummary
* smryStream
)
435 return (ms_ole_summary_get_long (smryStream
, MS_OLE_SUMMARY_PAGECOUNT
, &ret
));
439 wv1_glue_get_word_count (MsOleSummary
* smryStream
)
442 return (ms_ole_summary_get_long (smryStream
, MS_OLE_SUMMARY_WORDCOUNT
, &ret
));
446 wv1_glue_get_character_count (MsOleSummary
* smryStream
)
449 return (ms_ole_summary_get_long (smryStream
, MS_OLE_SUMMARY_CHARCOUNT
, &ret
));
453 wv1_glue_get_security (MsOleSummary
* smryStream
)
456 return (ms_ole_summary_get_long (smryStream
, MS_OLE_SUMMARY_SECURITY
, &ret
));
460 wv1_glue_get_codepage (MsOleSummary
* smryStream
)
463 return (ms_ole_summary_get_short (smryStream
, MS_OLE_SUMMARY_CODEPAGE
, &ret
));
467 wv1_glue_close_stream (MsOle
* oleStream
, MsOleSummary
* summary
)
469 ms_ole_summary_close (summary
);
470 ms_ole_destroy (&oleStream
);