RTF filter complies with the MS RTF 1.5 specification (it works well with 1.8 as well).
[beagle.git] / glue / wv1-glue.c
blob989c791f24247954842b42cf47f5e2398b5d48b6
1 /*
2 * wv1-glue.c : A "C" wrapper for using wv1 (library to parse
3 * Microsoft Word documents).
5 * Copyright (C) 2004 Novell, Inc.
7 * Author: Veerapuram Varadhan <vvaradhan@novell.com>
8 * [Basic framework of this file is taken from wvRTF.c of wv-1.0]
13 * Permission is hereby granted, free of charge, to any person obtaining a
14 * copy of this software and associated documentation files (the "Software"),
15 * to deal in the Software without restriction, including without limitation
16 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
17 * and/or sell copies of the Software, and to permit persons to whom the
18 * Software is furnished to do so, subject to the following conditions:
20 * The above copyright notice and this permission notice shall be included in
21 * all copies or substantial portions of the Software.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
28 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29 * DEALINGS IN THE SOFTWARE.
32 #include <wv.h>
34 /* Callback to Handle "text" (or words) extracted out of
35 * M$ Word documents
37 * text: Holds the extracted text/words.
39 * hotText: Identifies the attributes of the text.
40 * (bold, italic, underline, superscript, subscript)
43 typedef void (* wvTextHandlerCallback) (U8* text, int len, U8 hotText);
45 typedef struct _UserData {
46 /* formatting variables */
48 int cFontSize;
49 int cCol;
51 /* boolean formats */
52 int bIsBold:1;
53 int bIsItalic:1;
54 int bIsStrike:1;
55 int bIsUl:1;
56 int bIsSup:1;
57 int bIsSub:1;
59 /* beagle specifc formats */
60 U8 bIsHot;
62 /* buffer to hold text */
63 GString* txtWord;
65 wvTextHandlerCallback WordHandler;
67 } UserData;
71 * append_char: fills the txtWord buffer with the character 'ch'
72 * converted to UTF8 encoding. Calls the "WordHandler" for every
73 * word/line/end of a paragraph or for every 1023 characters,
74 * whichever comes first.
76 * ud : carries the UserData filled-in appropriately to hold the
77 * character (text) attributes.
79 * ch : unicode character
83 void
84 append_char (UserData * ud, U16 ch)
86 int hotText;
87 char tmpBuf[64];
88 int len = 0;
90 switch (ch) {
91 case 0x20: /* space */
92 case 0x0B: /* hard line break */
93 case 0x0D: /* paragraph end */
94 case 0x0C:
95 case '\n': /* new-line */
96 if (ch != '\n')
97 ch = 0x20;
98 else
99 ch = '\n';
100 g_string_append_c (ud->txtWord, ch);
101 break;
102 default:
103 len = g_unichar_to_utf8 (ch, tmpBuf);
104 g_string_append_len (ud->txtWord, tmpBuf, len);
105 break;
107 if (ch == 0x00 || ch == '\n' || ch == 0x20) {
108 (*(ud->WordHandler))(ud->txtWord->str, ud->txtWord->len, ud->bIsHot);
109 g_string_erase (ud->txtWord, 0, -1);
114 * fill_UserData: fills the UserData structure from the
115 * CHP structure that represents the Character Property
116 * Information like bold, italic, striked, underlined,
117 * superscript, subscript, fontsize, color, fontface etc.
120 void
121 fill_UserData (UserData * ud, CHP * chp, wvParseStruct * ps)
123 ud->cCol = 0;
124 if (chp->ico)
125 ud->cCol = chp->ico - 1;
127 ud->cFontSize = chp->hps;
128 ud->bIsBold = (chp->fBold);
129 ud->bIsItalic = (chp->fItalic);
130 ud->bIsUl = (chp->kul);
131 ud->bIsStrike = (chp->fStrike);
132 ud->bIsSup = (chp->iss == 1);
133 ud->bIsSub = (chp->iss == 2);
135 if (ud->bIsBold || ud->bIsItalic || ud->bIsUl || ud->bIsSup || ud->bIsSub)
136 ud->bIsHot = 1;
137 else
138 ud->bIsHot = 0;
141 /* This is a callback that handles the individual
142 * character that are extracted from M$ word file.
144 static int
145 charProc (wvParseStruct * ps, U16 eachchar, U8 chartype, U16 lid)
148 /* convert incoming character to unicode */
149 if (chartype) {
150 eachchar = wvHandleCodePage (eachchar, lid);
153 /* take care of any oddities in Microsoft's character "encoding" */
154 /* TODO: does the above code page handler take care of these? */
155 if (chartype == 1 && eachchar == 146)
156 eachchar = 39; /* apostrophe */
158 switch (eachchar)
160 case 14: /* column break */
161 break;
163 case 19: /* field begin */
164 /* flush current text buffer */
165 ps->fieldstate++;
166 ps->fieldmiddle = 0;
167 return 0;
168 case 20: /* field separator */
169 ps->fieldmiddle = 1;
170 return 0;
171 case 21: /* field end */
172 ps->fieldstate--;
173 ps->fieldmiddle = 0;
174 return 0;
176 default:
177 break;
180 if (eachchar == 0x14)
181 return 0;
183 append_char (ps->userData, eachchar);
184 return 0;
187 /* This is a callback that handles the special
188 * character that are specific to M$ word file.
190 static int
191 specCharProc (wvParseStruct * ps, U16 eachchar, CHP * achp)
193 Blip blip;
194 wvStream *fil;
195 long pos;
196 FSPA *fspa;
197 PICF picf;
198 FDOA *fdoa;
200 switch (eachchar)
202 case 19: /* field begin */
203 ps->fieldstate++;
204 ps->fieldmiddle = 0;
205 return 0;
206 case 20: /* field separator */
207 if (achp->fOle2)
209 /* printf ("Field has an embedded OLE2 object\n"); */
211 ps->fieldmiddle = 1;
212 return 0;
213 case 21: /* field end */
214 ps->fieldstate--;
215 ps->fieldmiddle = 0;
216 return 0;
217 default:
218 break;
221 if (ps->fieldstate)
223 if (eachchar == 0x13 || eachchar == 0x14)
224 return 0;
227 return 0;
230 /* This is a callback that handles the individual
231 * elements that are marked by libwv1.
234 static int
235 eleProc (wvParseStruct * ps, wvTag tag, void *props, int dirty)
237 /* some word structures */
238 PAP *apap;
239 CHP *achp;
240 SEP *asep;
241 int iRes;
243 UserData *ud = (UserData *) ps->userData;
245 switch (tag)
247 case SECTIONEND:
248 case PARAEND: /* pretty much nothing */
249 append_char (ud, '\n');
250 break;
252 case CHARPROPBEGIN:
253 achp = (CHP *) props;
254 fill_UserData (ud, achp, ps);
255 break;
257 case CHARPROPEND:
258 achp = (CHP *) props;
259 fill_UserData (ud, achp, ps);
260 break;
262 default:
263 break;
266 return 0;
269 /* This is a callback that handles the document
270 * level tags that are marked by libwv1.
273 static int
274 docProc (wvParseStruct * ps, wvTag tag)
276 switch (tag)
278 case DOCEND:
279 append_char (ps->userData, 0x00);
280 break;
282 default:
283 break;
286 return 0;
290 * wv1_glue_init_doc_parsing: Initiates the document parsing
291 * procedure. Sets up all the required handlers and the parser.
293 * fname: Name of the file to parse. (essentially a M$ word file)
295 * wvTextHandlerCallback: The callback routine that will be called
296 * on extraction of each word.
298 * Return: 0 -> success
299 * -1 -> failure.
303 wv1_glue_init_doc_parsing (char* fname, wvTextHandlerCallback callback)
305 FILE *input;
306 int ret;
308 wvParseStruct ps;
309 char *dir = NULL;
311 UserData ud;
313 input = fopen (fname, "rb");
314 if (!input)
315 return -1;
316 fclose (input);
318 wvInit ();
319 ret = wvInitParser (&ps, fname);
320 if (ret & 0x8000)
321 return -2;
322 else if (ret)
323 return -3;
325 ps.filename = fname;
326 ps.dir = dir;
328 /* set to 0 */
329 memset (&ud, 0, sizeof (UserData));
330 ud.WordHandler = callback;
331 ud.txtWord = g_string_sized_new (32);
332 ps.userData = &ud;
335 wvSetElementHandler (&ps, eleProc);
336 wvSetDocumentHandler (&ps, docProc);
337 wvSetCharHandler (&ps, charProc);
338 wvSetSpecialCharHandler (&ps, specCharProc);
340 wvText (&ps);
342 /* free associated memory */
343 wvOLEFree (&ps);
345 /* free userdata memory */
346 g_string_free (ud.txtWord, TRUE);
348 return 0;
351 void *
352 wv1_glue_get_ole_stream (const char* fname)
354 MsOle *ole = NULL;
355 ms_ole_open (&ole, fname);
356 return ((void *)ole);
359 void *
360 wv1_glue_get_ole_summary_stream (MsOle *stream)
362 MsOle *oleStream = (MsOle *)stream;
363 MsOleSummary *summary = NULL;
364 summary = ms_ole_summary_open (oleStream);
365 return ((void *)summary);
368 char *
369 wv1_glue_get_title (MsOleSummary* smryStream)
371 int ret;
372 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_TITLE, &ret));
375 char *
376 wv1_glue_get_subject (MsOleSummary* smryStream)
378 int ret;
379 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_SUBJECT, &ret));
382 char *
383 wv1_glue_get_author (MsOleSummary* smryStream)
385 int ret;
386 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_AUTHOR, &ret));
389 char *
390 wv1_glue_get_keywords (MsOleSummary* smryStream)
392 int ret;
393 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_KEYWORDS, &ret));
396 char *
397 wv1_glue_get_comments (MsOleSummary* smryStream)
399 int ret;
400 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_COMMENTS, &ret));
403 char *
404 wv1_glue_get_template (MsOleSummary* smryStream)
406 int ret;
407 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_TEMPLATE, &ret));
410 char *
411 wv1_glue_get_lastsavedby (MsOleSummary* smryStream)
413 int ret;
414 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_LASTAUTHOR, &ret));
417 char *
418 wv1_glue_get_revision_number (MsOleSummary* smryStream)
420 int ret;
421 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_REVNUMBER, &ret));
424 char *
425 wv1_glue_get_appname (MsOleSummary* smryStream)
427 int ret;
428 return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_APPNAME, &ret));
431 long
432 wv1_glue_get_page_count (MsOleSummary* smryStream)
434 int ret;
435 return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_PAGECOUNT, &ret));
438 long
439 wv1_glue_get_word_count (MsOleSummary* smryStream)
441 int ret;
442 return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_WORDCOUNT, &ret));
445 long
446 wv1_glue_get_character_count (MsOleSummary* smryStream)
448 int ret;
449 return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_CHARCOUNT, &ret));
452 long
453 wv1_glue_get_security (MsOleSummary* smryStream)
455 int ret;
456 return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_SECURITY, &ret));
459 short
460 wv1_glue_get_codepage (MsOleSummary* smryStream)
462 int ret;
463 return (ms_ole_summary_get_short (smryStream, MS_OLE_SUMMARY_CODEPAGE, &ret));
466 void
467 wv1_glue_close_stream (MsOle* oleStream, MsOleSummary* summary)
469 ms_ole_summary_close (summary);
470 ms_ole_destroy (&oleStream);